/*
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.gnu.org/licenses/gpl-2.0.html
 */
/*
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2012, 2015, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 *
 * lustre/ofd/ofd_dlm.c
 *
 * This file contains OBD Filter Device (OFD) LDLM-related code which is just
 * intent handling for glimpse lock.
 *
 * Author: Andreas Dilger <andreas.dilger@intel.com>
 * Author: Jinshan Xiong <jinshan.xiong@intel.com>
 * Author: Alexey Zhuravlev <alexey.zhuravlev@intel.com>
 * Author: Mikhail Pershin <mike.pershin@intel.com>
 */
43 #define DEBUG_SUBSYSTEM S_FILTER
45 #include "ofd_internal.h"
47 struct ofd_intent_args {
48 struct list_head gl_list;
54 int ofd_dlm_init(void)
56 ldlm_glimpse_work_kmem = kmem_cache_create("ldlm_glimpse_work_kmem",
57 sizeof(struct ldlm_glimpse_work),
59 if (ldlm_glimpse_work_kmem == NULL)
65 void ofd_dlm_exit(void)
67 if (ldlm_glimpse_work_kmem) {
68 kmem_cache_destroy(ldlm_glimpse_work_kmem);
69 ldlm_glimpse_work_kmem = NULL;
74 * OFD interval callback.
76 * The interval_callback_t is part of interval_iterate_reverse() and is called
77 * for each interval in tree. The OFD interval callback searches for locks
78 * covering extents beyond the given args->size. This is used to decide if the
79 * size is too small and needs to be updated. Note that we are only interested
80 * in growing the size, as truncate is the only operation which can shrink it,
81 * and it is handled differently. This is why we only look at locks beyond the
84 * It finds the highest lock (by starting point) in this interval, and adds it
85 * to the list of locks to glimpse. We must glimpse a list of locks - rather
86 * than only the highest lock on the file - because lockahead creates extent
87 * locks in advance of IO, and so breaks the assumption that the holder of the
88 * highest lock knows the current file size.
90 * This assumption is normally true because locks which are created as part of
91 * IO - rather than in advance of it - are guaranteed to be 'active', i.e.,
92 * involved in IO, and the holder of the highest 'active' lock always knows the
93 * current file size, because the size is either not changing or the holder of
94 * that lock is responsible for updating it.
96 * So we need only glimpse until we find the first client with an 'active'
99 * Unfortunately, there is no way to know if a manually requested/speculative
100 * lock is 'active' from the server side. So when we see a potentially
101 * speculative lock, we must send a glimpse for that lock unless we have
102 * already sent a glimpse to the holder of that lock.
104 * However, *all* non-speculative locks are active. So we can stop glimpsing
105 * as soon as we find a non-speculative lock. Currently, all speculative PW
106 * locks have LDLM_FL_NO_EXPANSION set, and we use this to identify them. This
107 * is enforced by an assertion in osc_lock_init, which references this comment.
109 * If that ever changes, we will either need to find a new way to identify
110 * active locks or we will need to consider all PW locks (we will still only
111 * glimpse one per client).
113 * Note that it is safe to glimpse only the 'top' lock from each interval
114 * because ofd_intent_cb is only called for PW extent locks, and for PW locks,
115 * there is only one lock per interval.
117 * \param[in] n interval node
118 * \param[in,out] args intent arguments, gl work list for identified locks
120 * \retval INTERVAL_ITER_STOP if the interval is lower than
121 * file size, caller stops execution
122 * \retval INTERVAL_ITER_CONT if callback finished successfully
123 * and caller may continue execution
125 static enum interval_iter ofd_intent_cb(struct interval_node *n, void *args)
127 struct ldlm_interval *node = (struct ldlm_interval *)n;
128 struct ofd_intent_args *arg = args;
129 __u64 size = arg->size;
130 struct ldlm_lock *victim_lock = NULL;
131 struct ldlm_lock *lck;
132 struct ldlm_glimpse_work *gl_work = NULL;
135 /* If the interval is lower than the current file size, just break. */
136 if (interval_high(n) <= size)
137 GOTO(out, rc = INTERVAL_ITER_STOP);
139 /* Find the 'victim' lock from this interval */
140 list_for_each_entry(lck, &node->li_group, l_sl_policy) {
142 victim_lock = LDLM_LOCK_GET(lck);
144 /* the same policy group - every lock has the
145 * same extent, so needn't do it any more */
149 /* l_export can be null in race with eviction - In that case, we will
150 * not find any locks in this interval */
152 GOTO(out, rc = INTERVAL_ITER_CONT);
155 * This check is for lock taken in ofd_destroy_by_fid() that does
156 * not have l_glimpse_ast set. So the logic is: if there is a lock
157 * with no l_glimpse_ast set, this object is being destroyed already.
158 * Hence, if you are grabbing DLM locks on the server, always set
159 * non-NULL glimpse_ast (e.g., ldlm_request.c::ldlm_glimpse_ast()).
161 if (victim_lock->l_glimpse_ast == NULL) {
162 LDLM_DEBUG(victim_lock, "no l_glimpse_ast");
163 arg->no_glimpse_ast = true;
164 GOTO(out_release, rc = INTERVAL_ITER_STOP);
167 /* If NO_EXPANSION is not set, this is an active lock, and we don't need
168 * to glimpse any further once we've glimpsed the client holding this
169 * lock. So set us up to stop. See comment above this function. */
170 if (!(victim_lock->l_flags & LDLM_FL_NO_EXPANSION))
171 rc = INTERVAL_ITER_STOP;
173 rc = INTERVAL_ITER_CONT;
175 /* Check to see if we're already set up to send a glimpse to this
176 * client; if so, don't add this lock to the glimpse list - We need
177 * only glimpse each client once. (And if we know that client holds
178 * an active lock, we can stop glimpsing. So keep the rc set in the
180 list_for_each_entry(gl_work, &arg->gl_list, gl_list) {
181 if (gl_work->gl_lock->l_export == victim_lock->l_export)
182 GOTO(out_release, rc);
185 if (!OBD_FAIL_CHECK(OBD_FAIL_OST_GL_WORK_ALLOC))
186 OBD_SLAB_ALLOC_PTR_GFP(gl_work, ldlm_glimpse_work_kmem,
190 arg->error = -ENOMEM;
191 GOTO(out_release, rc = INTERVAL_ITER_STOP);
194 /* Populate the gl_work structure. */
195 gl_work->gl_lock = victim_lock;
196 list_add_tail(&gl_work->gl_list, &arg->gl_list);
197 /* There is actually no need for a glimpse descriptor when glimpsing
199 gl_work->gl_desc = NULL;
200 /* This tells ldlm_work_gl_ast_lock this was allocated from a slab and
201 * must be freed in a slab-aware manner. */
202 gl_work->gl_flags = LDLM_GL_WORK_SLAB_ALLOCATED;
207 /* If the victim doesn't go on the glimpse list, we must release it */
208 LDLM_LOCK_RELEASE(victim_lock);
214 * OFD lock intent policy
216 * This defines ldlm_namespace::ns_policy interface for OFD.
217 * Intent policy is called when lock has an intent, for OFD that
218 * means glimpse lock and policy fills Lock Value Block (LVB).
220 * If already granted lock is found it will be placed in \a lockp and
221 * returned back to caller function.
223 * \param[in] ns namespace
224 * \param[in,out] lockp pointer to the lock
225 * \param[in] req_cookie incoming request
226 * \param[in] mode LDLM mode
227 * \param[in] flags LDLM flags
228 * \param[in] data opaque data, not used in OFD policy
230 * \retval ELDLM_LOCK_REPLACED if already granted lock was found
231 * and placed in \a lockp
232 * \retval ELDLM_LOCK_ABORTED in other cases except error
233 * \retval negative errno on error
235 int ofd_intent_policy(struct ldlm_namespace *ns, struct ldlm_lock **lockp,
236 void *req_cookie, enum ldlm_mode mode, __u64 flags,
239 struct ptlrpc_request *req = req_cookie;
240 struct ldlm_lock *lock = *lockp;
241 struct ldlm_resource *res = lock->l_resource;
242 ldlm_processing_policy policy;
243 struct ost_lvb *res_lvb, *reply_lvb;
244 struct ldlm_reply *rep;
247 struct ldlm_interval_tree *tree;
248 struct ofd_intent_args arg;
250 [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
251 [DLM_LOCKREPLY_OFF] = sizeof(*rep),
252 [DLM_REPLY_REC_OFF] = sizeof(*reply_lvb)
254 struct ldlm_glimpse_work *pos, *tmp;
257 INIT_LIST_HEAD(&arg.gl_list);
258 arg.no_glimpse_ast = false;
260 lock->l_lvb_type = LVB_T_OST;
261 policy = ldlm_get_processing_policy(res);
262 LASSERT(policy != NULL);
263 LASSERT(req != NULL);
265 rc = lustre_pack_reply(req, 3, repsize, NULL);
267 RETURN(req->rq_status = rc);
269 rep = lustre_msg_buf(req->rq_repmsg, DLM_LOCKREPLY_OFF, sizeof(*rep));
270 LASSERT(rep != NULL);
272 reply_lvb = lustre_msg_buf(req->rq_repmsg, DLM_REPLY_REC_OFF,
274 LASSERT(reply_lvb != NULL);
276 /* Call the extent policy function to see if our request can be
277 * granted, or is blocked.
278 * If the OST lock has LDLM_FL_HAS_INTENT set, it means a glimpse
279 * lock, and should not be granted if the lock will be blocked.
282 if (flags & LDLM_FL_BLOCK_NOWAIT) {
283 OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_AGL_DELAY, 5);
285 if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_AGL_NOLOCK))
286 RETURN(ELDLM_LOCK_ABORTED);
289 LASSERT(ns == ldlm_res_to_ns(res));
292 /* Check if this is a resend case (MSG_RESENT is set on RPC) and a
293 * lock was found by ldlm_handle_enqueue(); if so no need to grant
295 if (flags & LDLM_FL_RESENT) {
296 rc = LDLM_ITER_CONTINUE;
299 rc = policy(lock, &tmpflags, LDLM_PROCESS_RESCAN, &err, NULL);
300 check_res_locked(res);
303 /* The lock met with no resistance; we're finished. */
304 if (rc == LDLM_ITER_CONTINUE) {
305 if (OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_GLIMPSE, 2)) {
306 ldlm_resource_unlink_lock(lock);
307 err = ELDLM_LOCK_ABORTED;
309 err = ELDLM_LOCK_REPLACED;
313 } else if (flags & LDLM_FL_BLOCK_NOWAIT) {
314 /* LDLM_FL_BLOCK_NOWAIT means it is for AGL. Do not send glimpse
315 * callback for glimpse size. The real size user will trigger
316 * the glimpse callback when necessary. */
318 RETURN(ELDLM_LOCK_ABORTED);
321 /* Do not grant any lock, but instead send GL callbacks. The extent
322 * policy nicely created a list of all PW locks for us. We will choose
323 * the highest of those which are larger than the size in the LVB, if
324 * any, and perform a glimpse callback. */
325 res_lvb = res->lr_lvb_data;
326 LASSERT(res_lvb != NULL);
327 *reply_lvb = *res_lvb;
330 * ->ns_lock guarantees that no new locks are granted, and,
331 * therefore, that res->lr_lvb_data cannot increase beyond the
332 * end of already granted lock. As a result, it is safe to
333 * check against "stale" reply_lvb->lvb_size value without
336 arg.size = reply_lvb->lvb_size;
338 /* Check for PW locks beyond the size in the LVB, build the list
339 * of locks to glimpse (arg.gl_list) */
340 for (idx = 0; idx < LCK_MODE_NUM; idx++) {
341 tree = &res->lr_itree[idx];
342 if (tree->lit_mode == LCK_PR)
345 interval_iterate_reverse(tree->lit_root, ofd_intent_cb, &arg);
348 GOTO(out, rc = arg.error);
353 /* There were no PW locks beyond the size in the LVB; finished. */
354 if (list_empty(&arg.gl_list))
355 RETURN(ELDLM_LOCK_ABORTED);
357 if (arg.no_glimpse_ast) {
358 /* We are racing with unlink(); just return -ENOENT */
359 rep->lock_policy_res1 = ptlrpc_status_hton(-ENOENT);
360 GOTO(out, ELDLM_LOCK_ABORTED);
363 /* this will update the LVB */
364 ldlm_glimpse_locks(res, &arg.gl_list);
367 *reply_lvb = *res_lvb;
371 /* If the list is not empty, we failed to glimpse some locks and
372 * must clean up. Usually due to a race with unlink.*/
373 list_for_each_entry_safe(pos, tmp, &arg.gl_list, gl_list) {
374 list_del(&pos->gl_list);
375 LDLM_LOCK_RELEASE(pos->gl_lock);
376 OBD_SLAB_FREE_PTR(pos, ldlm_glimpse_work_kmem);
379 RETURN(rc < 0 ? rc : ELDLM_LOCK_ABORTED);