Whamcloud - gitweb
LU-11926 ldlm: Lost lease lock on migrate error
[fs/lustre-release.git] / lustre / ldlm / ldlm_request.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.gnu.org/licenses/gpl-2.0.html
19  *
20  * GPL HEADER END
21  */
22 /*
23  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Use is subject to license terms.
25  *
26  * Copyright (c) 2010, 2017, Intel Corporation.
27  */
28 /*
29  * This file is part of Lustre, http://www.lustre.org/
30  * Lustre is a trademark of Sun Microsystems, Inc.
31  */
32 /**
33  * This file contains Asynchronous System Trap (AST) handlers and related
34  * LDLM request-processing routines.
35  *
36  * An AST is a callback issued on a lock when its state is changed. There are
37  * several different types of ASTs (callbacks) registered for each lock:
38  *
39  * - completion AST: when a lock is enqueued by some process, but cannot be
40  *   granted immediately due to other conflicting locks on the same resource,
41  *   the completion AST is sent to notify the caller when the lock is
42  *   eventually granted
43  *
44  * - blocking AST: when a lock is granted to some process, if another process
45  *   enqueues a conflicting (blocking) lock on a resource, a blocking AST is
46  *   sent to notify the holder(s) of the lock(s) of the conflicting lock
47  *   request. The lock holder(s) must release their lock(s) on that resource in
48  *   a timely manner or be evicted by the server.
49  *
50  * - glimpse AST: this is used when a process wants information about a lock
51  *   (i.e. the lock value block (LVB)) but does not necessarily require holding
52  *   the lock. If the resource is locked, the lock holder(s) are sent glimpse
53  *   ASTs and the LVB is returned to the caller, and lock holder(s) may CANCEL
54  *   their lock(s) if they are idle. If the resource is not locked, the server
55  *   may grant the lock.
56  */
57
58 #define DEBUG_SUBSYSTEM S_LDLM
59
60 #include <lustre_errno.h>
61 #include <lustre_dlm.h>
62 #include <obd_class.h>
63 #include <obd.h>
64
65 #include "ldlm_internal.h"
66
/*
 * Lower bound applied to the client-side lock completion timeout computed
 * by ldlm_cp_timeout() (which returns a value in seconds). Exposed as a
 * module parameter with mode 0644.
 */
unsigned int ldlm_enqueue_min = OBD_TIMEOUT_DEFAULT;
module_param(ldlm_enqueue_min, uint, 0644);
MODULE_PARM_DESC(ldlm_enqueue_min, "lock enqueue timeout minimum");

/* On the client side: whether cached locks are canceled before replay. */
unsigned int ldlm_cancel_unused_locks_before_replay = 1;
73
/*
 * Interrupt callback passed to LWI_INTR()/LWI_TIMEOUT_INTR() by
 * ldlm_completion_ast(). Intentionally empty: \a data is not used.
 */
static void interrupted_completion_wait(void *data)
{
}
77
/*
 * Context handed to the completion-wait callbacks
 * (ldlm_expired_completion_wait() / interrupted_completion_wait()).
 */
struct lock_wait_data {
        /* lock whose grant/cancel is being waited for */
        struct ldlm_lock *lwd_lock;
        /* copy of the import's imp_conn_cnt taken before sleeping; passed
         * to ptlrpc_fail_import() when the wait times out */
        __u32             lwd_conn_cnt;
};
82
/*
 * Wrapper around a single lock handle.
 * NOTE(review): no user is visible in this part of the file; presumably the
 * argument block for an asynchronous request - confirm against the callers.
 */
struct ldlm_async_args {
        struct lustre_handle lock_handle;
};
86
87 /**
88  * ldlm_request_bufsize
89  *
90  * If opcode=LDLM_ENQUEUE, 1 slot is already occupied,
91  * LDLM_LOCKREQ_HANDLE -1 slots are available.
92  * Otherwise, LDLM_LOCKREQ_HANDLE slots are available.
93  *
94  * \param[in] count
95  * \param[in] type
96  *
97  * \retval size of the request buffer
98  */
99
100 int ldlm_request_bufsize(int count, int type)
101 {
102         int avail = LDLM_LOCKREQ_HANDLES;
103         if (type == LDLM_ENQUEUE)
104                 avail -= LDLM_ENQUEUE_CANCEL_OFF;
105
106         if (count > avail)
107                 avail = (count - avail) * sizeof(struct lustre_handle);
108         else
109                 avail = 0;
110
111         return sizeof(struct ldlm_request) + avail;
112 }
113
114 int ldlm_expired_completion_wait(void *data)
115 {
116         struct lock_wait_data *lwd = data;
117         struct ldlm_lock *lock = lwd->lwd_lock;
118         struct obd_import *imp;
119         struct obd_device *obd;
120
121         ENTRY;
122         if (lock->l_conn_export == NULL) {
123                 static time64_t next_dump, last_dump;
124
125                 LDLM_ERROR(lock, "lock timed out (enqueued at %lld, %llds ago); "
126                            "not entering recovery in server code, just going back to sleep",
127                            (s64)lock->l_activity,
128                            (s64)(ktime_get_real_seconds() -
129                                  lock->l_activity));
130                 if (ktime_get_seconds() > next_dump) {
131                         last_dump = next_dump;
132                         next_dump = ktime_get_seconds() + 300;
133                         ldlm_namespace_dump(D_DLMTRACE,
134                                             ldlm_lock_to_ns(lock));
135                         if (last_dump == 0)
136                                 libcfs_debug_dumplog();
137                 }
138                 RETURN(0);
139         }
140
141         obd = lock->l_conn_export->exp_obd;
142         imp = obd->u.cli.cl_import;
143         ptlrpc_fail_import(imp, lwd->lwd_conn_cnt);
144         LDLM_ERROR(lock, "lock timed out (enqueued at %lld, %llds ago), entering recovery for %s@%s",
145                   (s64)lock->l_activity,
146                   (s64)(ktime_get_real_seconds() - lock->l_activity),
147                   obd2cli_tgt(obd), imp->imp_connection->c_remote_uuid.uuid);
148
149         RETURN(0);
150 }
151
152 /**
153  * Calculate the Completion timeout (covering enqueue, BL AST, data flush,
154  * lock cancel, and their replies). Used for lock completion timeout on the
155  * client side.
156  *
157  * \param[in] lock        lock which is waiting the completion callback
158  *
159  * \retval            timeout in seconds to wait for the server reply
160  */
161
162 /* We use the same basis for both server side and client side functions
163    from a single node. */
164 static time64_t ldlm_cp_timeout(struct ldlm_lock *lock)
165 {
166         time64_t timeout;
167
168         if (AT_OFF)
169                 return obd_timeout;
170
171         /* Wait a long time for enqueue - server may have to callback a
172          * lock from another client.  Server will evict the other client if it
173          * doesn't respond reasonably, and then give us the lock. */
174         timeout = at_get(ldlm_lock_to_ns_at(lock));
175         return max(3 * timeout, (time64_t) ldlm_enqueue_min);
176 }
177
178 /**
179  * Helper function for ldlm_completion_ast(), updating timings when lock is
180  * actually granted.
181  */
182 static int ldlm_completion_tail(struct ldlm_lock *lock, void *data)
183 {
184         time64_t delay;
185         int result = 0;
186
187         if (ldlm_is_destroyed(lock) || ldlm_is_failed(lock)) {
188                 LDLM_DEBUG(lock, "client-side enqueue: destroyed");
189                 result = -EIO;
190         } else if (data == NULL) {
191                 LDLM_DEBUG(lock, "client-side enqueue: granted");
192         } else {
193                 /* Take into AT only CP RPC, not immediately granted locks */
194                 delay = ktime_get_real_seconds() - lock->l_activity;
195                 LDLM_DEBUG(lock, "client-side enqueue: granted after %llds",
196                            (s64)delay);
197
198                 /* Update our time estimate */
199                 at_measured(ldlm_lock_to_ns_at(lock), delay);
200         }
201         return result;
202 }
203
204 /**
205  * Implementation of ->l_completion_ast() for a client, that doesn't wait
206  * until lock is granted. Suitable for locks enqueued through ptlrpcd, of
207  * other threads that cannot block for long.
208  */
209 int ldlm_completion_ast_async(struct ldlm_lock *lock, __u64 flags, void *data)
210 {
211         ENTRY;
212
213         if (flags == LDLM_FL_WAIT_NOREPROC) {
214                 LDLM_DEBUG(lock, "client-side enqueue waiting on pending lock");
215                 RETURN(0);
216         }
217
218         if (!(flags & LDLM_FL_BLOCKED_MASK)) {
219                 wake_up(&lock->l_waitq);
220                 RETURN(ldlm_completion_tail(lock, data));
221         }
222
223         LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, "
224                    "going forward");
225         ldlm_reprocess_all(lock->l_resource);
226         RETURN(0);
227 }
228 EXPORT_SYMBOL(ldlm_completion_ast_async);
229
230 /**
231  * Generic LDLM "completion" AST. This is called in several cases:
232  *
233  *     - when a reply to an ENQUEUE RPC is received from the server
234  *       (ldlm_cli_enqueue_fini()). Lock might be granted or not granted at
235  *       this point (determined by flags);
236  *
237  *     - when LDLM_CP_CALLBACK RPC comes to client to notify it that lock has
238  *       been granted;
239  *
240  *     - when ldlm_lock_match(LDLM_FL_LVB_READY) is about to wait until lock
241  *       gets correct lvb;
242  *
243  *     - to force all locks when resource is destroyed (cleanup_resource());
244  *
245  * If lock is not granted in the first case, this function waits until second
246  * or penultimate cases happen in some other thread.
247  *
248  */
249 int ldlm_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data)
250 {
251         /* XXX ALLOCATE - 160 bytes */
252         struct lock_wait_data lwd;
253         struct obd_device *obd;
254         struct obd_import *imp = NULL;
255         struct l_wait_info lwi;
256         time64_t timeout;
257         int rc = 0;
258         ENTRY;
259
260         if (flags == LDLM_FL_WAIT_NOREPROC) {
261                 LDLM_DEBUG(lock, "client-side enqueue waiting on pending lock");
262                 goto noreproc;
263         }
264
265         if (!(flags & LDLM_FL_BLOCKED_MASK)) {
266                 wake_up(&lock->l_waitq);
267                 RETURN(0);
268         }
269
270         LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, "
271                    "sleeping");
272
273 noreproc:
274
275         obd = class_exp2obd(lock->l_conn_export);
276
277         /* if this is a local lock, then there is no import */
278         if (obd != NULL) {
279                 imp = obd->u.cli.cl_import;
280         }
281
282         timeout = ldlm_cp_timeout(lock);
283
284         lwd.lwd_lock = lock;
285         lock->l_activity = ktime_get_real_seconds();
286
287         if (ldlm_is_no_timeout(lock)) {
288                 LDLM_DEBUG(lock, "waiting indefinitely because of NO_TIMEOUT");
289                 lwi = LWI_INTR(interrupted_completion_wait, &lwd);
290         } else {
291                 lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(timeout),
292                                        ldlm_expired_completion_wait,
293                                        interrupted_completion_wait, &lwd);
294         }
295
296         if (imp != NULL) {
297                 spin_lock(&imp->imp_lock);
298                 lwd.lwd_conn_cnt = imp->imp_conn_cnt;
299                 spin_unlock(&imp->imp_lock);
300         }
301
302         if (ns_is_client(ldlm_lock_to_ns(lock)) &&
303             OBD_FAIL_CHECK_RESET(OBD_FAIL_LDLM_INTR_CP_AST,
304                                  OBD_FAIL_LDLM_CP_BL_RACE | OBD_FAIL_ONCE)) {
305                 ldlm_set_fail_loc(lock);
306                 rc = -EINTR;
307         } else {
308                 /* Go to sleep until the lock is granted or cancelled. */
309                 rc = l_wait_event(lock->l_waitq,
310                                   is_granted_or_cancelled(lock), &lwi);
311         }
312
313         if (rc) {
314                 LDLM_DEBUG(lock, "client-side enqueue waking up: failed (%d)",
315                            rc);
316                 RETURN(rc);
317         }
318
319         RETURN(ldlm_completion_tail(lock, data));
320 }
321 EXPORT_SYMBOL(ldlm_completion_ast);
322
323 /**
324  * A helper to build a blocking AST function
325  *
326  * Perform a common operation for blocking ASTs:
327  * defferred lock cancellation.
328  *
329  * \param lock the lock blocking or canceling AST was called on
330  * \retval 0
331  * \see mdt_blocking_ast
332  * \see ldlm_blocking_ast
333  */
334 int ldlm_blocking_ast_nocheck(struct ldlm_lock *lock)
335 {
336         int do_ast;
337         ENTRY;
338
339         ldlm_set_cbpending(lock);
340         do_ast = (!lock->l_readers && !lock->l_writers);
341         unlock_res_and_lock(lock);
342
343         if (do_ast) {
344                 struct lustre_handle lockh;
345                 int rc;
346
347                 LDLM_DEBUG(lock, "already unused, calling ldlm_cli_cancel");
348                 ldlm_lock2handle(lock, &lockh);
349                 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
350                 if (rc < 0)
351                         CERROR("ldlm_cli_cancel: %d\n", rc);
352         } else {
353                 LDLM_DEBUG(lock, "Lock still has references, will be "
354                            "cancelled later");
355         }
356         RETURN(0);
357 }
358 EXPORT_SYMBOL(ldlm_blocking_ast_nocheck);
359
360 /**
361  * Server blocking AST
362  *
363  * ->l_blocking_ast() callback for LDLM locks acquired by server-side
364  * OBDs.
365  *
366  * \param lock the lock which blocks a request or cancelling lock
367  * \param desc unused
368  * \param data unused
369  * \param flag indicates whether this cancelling or blocking callback
370  * \retval 0
371  * \see ldlm_blocking_ast_nocheck
372  */
373 int ldlm_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
374                       void *data, int flag)
375 {
376         ENTRY;
377
378         if (flag == LDLM_CB_CANCELING) {
379                 /* Don't need to do anything here. */
380                 RETURN(0);
381         }
382
383         lock_res_and_lock(lock);
384         /* Get this: if ldlm_blocking_ast is racing with intent_policy, such
385          * that ldlm_blocking_ast is called just before intent_policy method
386          * takes the lr_lock, then by the time we get the lock, we might not
387          * be the correct blocking function anymore.  So check, and return
388          * early, if so. */
389         if (lock->l_blocking_ast != ldlm_blocking_ast) {
390                 unlock_res_and_lock(lock);
391                 RETURN(0);
392         }
393         RETURN(ldlm_blocking_ast_nocheck(lock));
394 }
395 EXPORT_SYMBOL(ldlm_blocking_ast);
396
/**
 * Implements ldlm_lock::l_glimpse_ast for extent locks acquired on the server.
 *
 * This is a dummy callback: it ignores both arguments and unconditionally
 * returns -ELDLM_NO_LOCK_DATA.
 *
 * Returning -ELDLM_NO_LOCK_DATA actually works, but the reason for that is
 * rather subtle: with OST-side locking, it may so happen that _all_ extent
 * locks are held by the OST. If client wants to obtain the current file size
 * it calls ll_glimpse_size(), and (as all locks are held only on the server),
 * this dummy glimpse callback fires and does nothing. The client still
 * receives the correct file size due to the following fragment of code in
 * ldlm_cb_interpret():
 *
 *      if (rc == -ELDLM_NO_LOCK_DATA) {
 *              LDLM_DEBUG(lock, "lost race - client has a lock but no"
 *                         "inode");
 *              ldlm_res_lvbo_update(lock->l_resource, NULL, 1);
 *      }
 *
 * That is, after the glimpse returns this error, ofd_lvbo_update() is called
 * and returns the updated file attributes from the inode to the client.
 *
 * See also comment in ofd_intent_policy() on why servers must set a non-NULL
 * l_glimpse_ast when grabbing DLM locks.  Otherwise, the server will assume
 * that the object is in the process of being destroyed.
 *
 * \param[in] lock      DLM lock being glimpsed, unused
 * \param[in] reqp      pointer to ptlrpc_request, unused
 *
 * \retval              -ELDLM_NO_LOCK_DATA to get attributes from disk object
 */
int ldlm_glimpse_ast(struct ldlm_lock *lock, void *reqp)
{
        return -ELDLM_NO_LOCK_DATA;
}
430
431 /**
432  * Enqueue a local lock (typically on a server).
433  */
434 int ldlm_cli_enqueue_local(const struct lu_env *env,
435                            struct ldlm_namespace *ns,
436                            const struct ldlm_res_id *res_id,
437                            enum ldlm_type type, union ldlm_policy_data *policy,
438                            enum ldlm_mode mode, __u64 *flags,
439                            ldlm_blocking_callback blocking,
440                            ldlm_completion_callback completion,
441                            ldlm_glimpse_callback glimpse,
442                            void *data, __u32 lvb_len, enum lvb_type lvb_type,
443                            const __u64 *client_cookie,
444                            struct lustre_handle *lockh)
445 {
446         struct ldlm_lock *lock;
447         int err;
448         const struct ldlm_callback_suite cbs = { .lcs_completion = completion,
449                                                  .lcs_blocking   = blocking,
450                                                  .lcs_glimpse    = glimpse,
451         };
452         ENTRY;
453
454         LASSERT(!(*flags & LDLM_FL_REPLAY));
455         if (unlikely(ns_is_client(ns))) {
456                 CERROR("Trying to enqueue local lock in a shadow namespace\n");
457                 LBUG();
458         }
459
460         lock = ldlm_lock_create(ns, res_id, type, mode, &cbs, data, lvb_len,
461                                 lvb_type);
462         if (IS_ERR(lock))
463                 GOTO(out_nolock, err = PTR_ERR(lock));
464
465         err = ldlm_lvbo_init(env, lock->l_resource);
466         if (err < 0) {
467                 LDLM_ERROR(lock, "delayed lvb init failed (rc %d)", err);
468                 ldlm_lock_destroy_nolock(lock);
469                 GOTO(out, err);
470         }
471
472         ldlm_lock2handle(lock, lockh);
473
474         /* NB: we don't have any lock now (lock_res_and_lock)
475          * because it's a new lock */
476         ldlm_lock_addref_internal_nolock(lock, mode);
477         ldlm_set_local(lock);
478         if (*flags & LDLM_FL_ATOMIC_CB)
479                 ldlm_set_atomic_cb(lock);
480
481         if (*flags & LDLM_FL_CANCEL_ON_BLOCK)
482                 ldlm_set_cancel_on_block(lock);
483
484         if (policy != NULL)
485                 lock->l_policy_data = *policy;
486         if (client_cookie != NULL)
487                 lock->l_client_cookie = *client_cookie;
488         if (type == LDLM_EXTENT) {
489                 /* extent lock without policy is a bug */
490                 if (policy == NULL)
491                         LBUG();
492
493                 lock->l_req_extent = policy->l_extent;
494         }
495
496         err = ldlm_lock_enqueue(env, ns, &lock, policy, flags);
497         if (unlikely(err != ELDLM_OK))
498                 GOTO(out, err);
499
500         if (policy != NULL)
501                 *policy = lock->l_policy_data;
502
503         if (lock->l_completion_ast)
504                 lock->l_completion_ast(lock, *flags, NULL);
505
506         LDLM_DEBUG(lock, "client-side local enqueue handler, new lock created");
507         EXIT;
508  out:
509         LDLM_LOCK_RELEASE(lock);
510  out_nolock:
511         return err;
512 }
513 EXPORT_SYMBOL(ldlm_cli_enqueue_local);
514
515 static void failed_lock_cleanup(struct ldlm_namespace *ns,
516                                 struct ldlm_lock *lock, int mode)
517 {
518         int need_cancel = 0;
519
520         /* Set a flag to prevent us from sending a CANCEL (bug 407) */
521         lock_res_and_lock(lock);
522         /* Check that lock is not granted or failed, we might race. */
523         if (!ldlm_is_granted(lock) && !ldlm_is_failed(lock)) {
524                 /* Make sure that this lock will not be found by raced
525                  * bl_ast and -EINVAL reply is sent to server anyways.
526                  * b=17645*/
527                 lock->l_flags |= LDLM_FL_LOCAL_ONLY | LDLM_FL_FAILED |
528                                  LDLM_FL_ATOMIC_CB | LDLM_FL_CBPENDING;
529                 need_cancel = 1;
530         }
531         unlock_res_and_lock(lock);
532
533         if (need_cancel)
534                 LDLM_DEBUG(lock,
535                            "setting FL_LOCAL_ONLY | LDLM_FL_FAILED | "
536                            "LDLM_FL_ATOMIC_CB | LDLM_FL_CBPENDING");
537         else
538                 LDLM_DEBUG(lock, "lock was granted or failed in race");
539
540         /* XXX - HACK because we shouldn't call ldlm_lock_destroy()
541          *       from llite/file.c/ll_file_flock(). */
542         /* This code makes for the fact that we do not have blocking handler on
543          * a client for flock locks. As such this is the place where we must
544          * completely kill failed locks. (interrupted and those that
545          * were waiting to be granted when server evicted us. */
546         if (lock->l_resource->lr_type == LDLM_FLOCK) {
547                 lock_res_and_lock(lock);
548                 if (!ldlm_is_destroyed(lock)) {
549                         ldlm_resource_unlink_lock(lock);
550                         ldlm_lock_decref_internal_nolock(lock, mode);
551                         ldlm_lock_destroy_nolock(lock);
552                 }
553                 unlock_res_and_lock(lock);
554         } else {
555                 ldlm_lock_decref_internal(lock, mode);
556         }
557 }
558
/**
 * Finishing portion of client lock enqueue code.
 *
 * Called after receiving reply from server. Looks up the lock behind
 * \a lockh, applies the server's reply to it (remote handle, flags, and -
 * when the server changed the lock - mode, resource and policy), fills the
 * LVB, and unless this is a replay enqueues the lock locally and calls its
 * completion AST.
 *
 * On exit two lock references are dropped: the one taken here by
 * ldlm_handle2lock() and the one held by ldlm_cli_enqueue.
 *
 * \param[in]     exp           export the lock was enqueued through
 * \param[in]     req           request whose reply is processed
 * \param[in]     type          lock type
 * \param[in]     with_policy   non-zero to take policy data from the reply
 *                              when the server changed the lock
 * \param[in]     mode          mode of the reference dropped on failure
 * \param[in,out] flags         in: LDLM_FL_REPLAY marks a replay;
 *                              out: flags returned by the server
 * \param[out]    lvb           optional buffer receiving the LVB
 * \param[in]     lvb_len       expected LVB size; if non-zero it must equal
 *                              the lock's l_lvb_len
 * \param[in]     lockh         handle of the lock being enqueued
 * \param[in]     rc            result of the enqueue RPC
 *
 * \retval ELDLM_OK             lock enqueued
 * \retval ELDLM_LOCK_ABORTED   enqueue was aborted (LVB still returned)
 * \retval -ENOLCK              no lock found for \a lockh
 * \retval other negative       failure; see the individual error paths
 */
int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req,
                          enum ldlm_type type, __u8 with_policy,
                          enum ldlm_mode mode, __u64 *flags, void *lvb,
                          __u32 lvb_len, const struct lustre_handle *lockh,
                          int rc)
{
        struct ldlm_namespace *ns = exp->exp_obd->obd_namespace;
        const struct lu_env *env = NULL;
        int is_replay = *flags & LDLM_FL_REPLAY;
        struct ldlm_lock *lock;
        struct ldlm_reply *reply;
        /* 1: failed_lock_cleanup() must run on error; 0: lock is already
         * enqueued on the server, so that cleanup is skipped */
        int cleanup_phase = 1;
        ENTRY;

        /* NOTE(review): req is NULL-checked here but dereferenced
         * unconditionally further down (req->rq_pill) - callers presumably
         * always pass a request; confirm. */
        if (req && req->rq_svc_thread)
                env = req->rq_svc_thread->t_env;

        lock = ldlm_handle2lock(lockh);
        /* ldlm_cli_enqueue is holding a reference on this lock. */
        if (!lock) {
                LASSERT(type == LDLM_FLOCK);
                RETURN(-ENOLCK);
        }

        LASSERTF(ergo(lvb_len != 0, lvb_len == lock->l_lvb_len),
                 "lvb_len = %d, l_lvb_len = %d\n", lvb_len, lock->l_lvb_len);

        if (rc != ELDLM_OK) {
                LASSERT(!is_replay);
                LDLM_DEBUG(lock, "client-side enqueue END (%s)",
                           rc == ELDLM_LOCK_ABORTED ? "ABORTED" : "FAILED");

                /* an aborted enqueue still carries a reply to look at */
                if (rc != ELDLM_LOCK_ABORTED)
                        GOTO(cleanup, rc);
        }

        /* Before we return, swab the reply */
        reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP);
        if (reply == NULL)
                GOTO(cleanup, rc = -EPROTO);

        if (lvb_len > 0) {
                int size = 0;

                size = req_capsule_get_size(&req->rq_pill, &RMF_DLM_LVB,
                                            RCL_SERVER);
                if (size < 0) {
                        LDLM_ERROR(lock, "Fail to get lvb_len, rc = %d", size);
                        GOTO(cleanup, rc = size);
                } else if (unlikely(size > lvb_len)) {
                        LDLM_ERROR(lock, "Replied LVB is larger than "
                                   "expectation, expected = %d, replied = %d",
                                   lvb_len, size);
                        GOTO(cleanup, rc = -EINVAL);
                }
                /* from here on work with the size actually replied */
                lvb_len = size;
        }

        if (rc == ELDLM_LOCK_ABORTED) {
                if (lvb_len > 0 && lvb != NULL)
                        rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_SERVER,
                                           lvb, lvb_len);
                /* keep a fill error, otherwise report the abort */
                GOTO(cleanup, rc = rc ? : ELDLM_LOCK_ABORTED);
        }

        /* lock enqueued on the server */
        cleanup_phase = 0;

        lock_res_and_lock(lock);
        /* Key change rehash lock in per-export hash with new key */
        if (exp->exp_lock_hash) {
                /* In the function below, .hs_keycmp resolves to
                 * ldlm_export_lock_keycmp() */
                /* coverity[overrun-buffer-val] */
                cfs_hash_rehash_key(exp->exp_lock_hash,
                                    &lock->l_remote_handle,
                                    &reply->lock_handle,
                                    &lock->l_exp_hash);
        } else {
                lock->l_remote_handle = reply->lock_handle;
        }

        /* hand all reply flags to the caller; the lock itself only
         * inherits those in LDLM_FL_INHERIT_MASK */
        *flags = ldlm_flags_from_wire(reply->lock_flags);
        lock->l_flags |= ldlm_flags_from_wire(reply->lock_flags &
                                              LDLM_FL_INHERIT_MASK);
        unlock_res_and_lock(lock);

        CDEBUG(D_INFO, "local: %p, remote cookie: %#llx, flags: %#llx\n",
               lock, reply->lock_handle.cookie, *flags);

        /* If enqueue returned a blocked lock but the completion handler has
         * already run, then it fixed up the resource and we don't need to do it
         * again. */
        if ((*flags) & LDLM_FL_LOCK_CHANGED) {
                int newmode = reply->lock_desc.l_req_mode;
                LASSERT(!is_replay);
                if (newmode && newmode != lock->l_req_mode) {
                        LDLM_DEBUG(lock, "server returned different mode %s",
                                   ldlm_lockname[newmode]);
                        lock->l_req_mode = newmode;
                }

                if (!ldlm_res_eq(&reply->lock_desc.l_resource.lr_name,
                                 &lock->l_resource->lr_name)) {
                        CDEBUG(D_INFO, "remote intent success, locking "DLDLMRES
                                       " instead of "DLDLMRES"\n",
                               PLDLMRES(&reply->lock_desc.l_resource),
                               PLDLMRES(lock->l_resource));

                        rc = ldlm_lock_change_resource(ns, lock,
                                        &reply->lock_desc.l_resource.lr_name);
                        /* any failure here is reported as -ENOMEM; note that
                         * cleanup_phase is 0 on this path */
                        if (rc || lock->l_resource == NULL)
                                GOTO(cleanup, rc = -ENOMEM);
                        LDLM_DEBUG(lock, "client-side enqueue, new resource");
                }

                if (with_policy) {
                        /* We assume lock type cannot change on server*/
                        ldlm_convert_policy_to_local(exp,
                                                lock->l_resource->lr_type,
                                                &reply->lock_desc.l_policy_data,
                                                &lock->l_policy_data);
                }

                if (type != LDLM_PLAIN)
                        LDLM_DEBUG(lock,"client-side enqueue, new policy data");
        }

        if ((*flags) & LDLM_FL_AST_SENT) {
                lock_res_and_lock(lock);
                lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_BL_AST;
                unlock_res_and_lock(lock);
                LDLM_DEBUG(lock, "enqueue reply includes blocking AST");
        }

        /* If the lock has already been granted by a completion AST, don't
         * clobber the LVB with an older one. */
        if (lvb_len > 0) {
                /* We must lock or a racing completion might update lvb without
                 * letting us know and we'll clobber the correct value.
                 * Cannot unlock after the check either, as that still leaves
                 * a tiny window for completion to get in */
                lock_res_and_lock(lock);
                if (!ldlm_is_granted(lock))
                        rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_SERVER,
                                           lock->l_lvb_data, lvb_len);
                unlock_res_and_lock(lock);
                if (rc < 0) {
                        /* re-arm failed_lock_cleanup() for this error */
                        cleanup_phase = 1;
                        GOTO(cleanup, rc);
                }
        }

        if (!is_replay) {
                rc = ldlm_lock_enqueue(env, ns, &lock, NULL, flags);
                if (lock->l_completion_ast != NULL) {
                        /* the completion AST runs even if the enqueue
                         * failed; the first error wins */
                        int err = lock->l_completion_ast(lock, *flags, NULL);
                        if (!rc)
                                rc = err;
                        if (rc)
                                cleanup_phase = 1;
                }
        }

        if (lvb_len > 0 && lvb != NULL) {
                /* Copy the LVB here, and not earlier, because the completion
                 * AST (if any) can override what we got in the reply */
                memcpy(lvb, lock->l_lvb_data, lvb_len);
        }

        LDLM_DEBUG(lock, "client-side enqueue END");
        EXIT;
cleanup:
        if (cleanup_phase == 1 && rc)
                failed_lock_cleanup(ns, lock, mode);
        /* Put lock 2 times, the second reference is held by ldlm_cli_enqueue */
        LDLM_LOCK_PUT(lock);
        LDLM_LOCK_RELEASE(lock);
        return rc;
}
EXPORT_SYMBOL(ldlm_cli_enqueue_fini);
745
746 /**
747  * Estimate number of lock handles that would fit into request of given
748  * size.  PAGE_SIZE-512 is to allow TCP/IP and LNET headers to fit into
749  * a single page on the send/receive side. XXX: 512 should be changed to
750  * more adequate value.
751  */
752 static inline int ldlm_req_handles_avail(int req_size, int off)
753 {
754         int avail;
755
756         avail = min_t(int, LDLM_MAXREQSIZE, PAGE_SIZE - 512) - req_size;
757         if (likely(avail >= 0))
758                 avail /= (int)sizeof(struct lustre_handle);
759         else
760                 avail = 0;
761         avail += LDLM_LOCKREQ_HANDLES - off;
762
763         return avail;
764 }
765
766 static inline int ldlm_capsule_handles_avail(struct req_capsule *pill,
767                                              enum req_location loc,
768                                              int off)
769 {
770         __u32 size = req_capsule_msg_size(pill, loc);
771         return ldlm_req_handles_avail(size, off);
772 }
773
774 static inline int ldlm_format_handles_avail(struct obd_import *imp,
775                                             const struct req_format *fmt,
776                                             enum req_location loc, int off)
777 {
778         __u32 size = req_capsule_fmt_size(imp->imp_msg_magic, fmt, loc);
779         return ldlm_req_handles_avail(size, off);
780 }
781
782 /**
783  * Cancel LRU locks and pack them into the enqueue request. Pack there the given
784  * \a count locks in \a cancels.
785  *
786  * This is to be called by functions preparing their own requests that
787  * might contain lists of locks to cancel in addition to actual operation
788  * that needs to be performed.
789  */
790 int ldlm_prep_elc_req(struct obd_export *exp, struct ptlrpc_request *req,
791                       int version, int opc, int canceloff,
792                       struct list_head *cancels, int count)
793         {
794         struct ldlm_namespace   *ns = exp->exp_obd->obd_namespace;
795         struct req_capsule      *pill = &req->rq_pill;
796         struct ldlm_request     *dlm = NULL;
797         struct list_head        head = LIST_HEAD_INIT(head);
798         enum ldlm_lru_flags lru_flags;
799         int avail, to_free, pack = 0;
800         int rc;
801         ENTRY;
802
803         if (cancels == NULL)
804                 cancels = &head;
805         if (ns_connect_cancelset(ns)) {
806                 /* Estimate the amount of available space in the request. */
807                 req_capsule_filled_sizes(pill, RCL_CLIENT);
808                 avail = ldlm_capsule_handles_avail(pill, RCL_CLIENT, canceloff);
809
810                 lru_flags = LDLM_LRU_FLAG_NO_WAIT | (ns_connect_lru_resize(ns) ?
811                         LDLM_LRU_FLAG_LRUR : LDLM_LRU_FLAG_AGED);
812                 to_free = !ns_connect_lru_resize(ns) &&
813                         opc == LDLM_ENQUEUE ? 1 : 0;
814
815                 /* Cancel LRU locks here _only_ if the server supports
816                  * EARLY_CANCEL. Otherwise we have to send extra CANCEL
817                  * RPC, which will make us slower. */
818                 if (avail > count)
819                         count += ldlm_cancel_lru_local(ns, cancels, to_free,
820                                                        avail - count, 0,
821                                                        lru_flags);
822                 if (avail > count)
823                         pack = count;
824                 else
825                         pack = avail;
826                 req_capsule_set_size(pill, &RMF_DLM_REQ, RCL_CLIENT,
827                                      ldlm_request_bufsize(pack, opc));
828         }
829
830         rc = ptlrpc_request_pack(req, version, opc);
831         if (rc) {
832                 ldlm_lock_list_put(cancels, l_bl_ast, count);
833                 RETURN(rc);
834         }
835
836         if (ns_connect_cancelset(ns)) {
837                 if (canceloff) {
838                         dlm = req_capsule_client_get(pill, &RMF_DLM_REQ);
839                         LASSERT(dlm);
840                         /* Skip first lock handler in ldlm_request_pack(),
841                          * this method will increment @lock_count according
842                          * to the lock handle amount actually written to
843                          * the buffer. */
844                         dlm->lock_count = canceloff;
845                 }
846                 /* Pack into the request @pack lock handles. */
847                 ldlm_cli_cancel_list(cancels, pack, req, 0);
848                 /* Prepare and send separate cancel RPC for others. */
849                 ldlm_cli_cancel_list(cancels, count - pack, NULL, 0);
850         } else {
851                 ldlm_lock_list_put(cancels, l_bl_ast, count);
852         }
853         RETURN(0);
854 }
855 EXPORT_SYMBOL(ldlm_prep_elc_req);
856
857 int ldlm_prep_enqueue_req(struct obd_export *exp, struct ptlrpc_request *req,
858                           struct list_head *cancels, int count)
859 {
860         return ldlm_prep_elc_req(exp, req, LUSTRE_DLM_VERSION, LDLM_ENQUEUE,
861                                  LDLM_ENQUEUE_CANCEL_OFF, cancels, count);
862 }
863 EXPORT_SYMBOL(ldlm_prep_enqueue_req);
864
865 struct ptlrpc_request *ldlm_enqueue_pack(struct obd_export *exp, int lvb_len)
866 {
867         struct ptlrpc_request *req;
868         int rc;
869         ENTRY;
870
871         req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LDLM_ENQUEUE);
872         if (req == NULL)
873                 RETURN(ERR_PTR(-ENOMEM));
874
875         rc = ldlm_prep_enqueue_req(exp, req, NULL, 0);
876         if (rc) {
877                 ptlrpc_request_free(req);
878                 RETURN(ERR_PTR(rc));
879         }
880
881         req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, lvb_len);
882         ptlrpc_request_set_replen(req);
883         RETURN(req);
884 }
885 EXPORT_SYMBOL(ldlm_enqueue_pack);
886
887 /**
888  * Client-side lock enqueue.
889  *
890  * If a request has some specific initialisation it is passed in \a reqp,
891  * otherwise it is created in ldlm_cli_enqueue.
892  *
893  * Supports sync and async requests, pass \a async flag accordingly. If a
894  * request was created in ldlm_cli_enqueue and it is the async request,
895  * pass it to the caller in \a reqp.
896  */
897 int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp,
898                      struct ldlm_enqueue_info *einfo,
899                      const struct ldlm_res_id *res_id,
900                      union ldlm_policy_data const *policy, __u64 *flags,
901                      void *lvb, __u32 lvb_len, enum lvb_type lvb_type,
902                      struct lustre_handle *lockh, int async)
903 {
904         struct ldlm_namespace *ns;
905         struct ldlm_lock      *lock;
906         struct ldlm_request   *body;
907         int                    is_replay = *flags & LDLM_FL_REPLAY;
908         int                    req_passed_in = 1;
909         int                    rc, err;
910         struct ptlrpc_request *req;
911         ENTRY;
912
913         LASSERT(exp != NULL);
914
915         ns = exp->exp_obd->obd_namespace;
916
917         /* If we're replaying this lock, just check some invariants.
918          * If we're creating a new lock, get everything all setup nice. */
919         if (is_replay) {
920                 lock = ldlm_handle2lock_long(lockh, 0);
921                 LASSERT(lock != NULL);
922                 LDLM_DEBUG(lock, "client-side enqueue START");
923                 LASSERT(exp == lock->l_conn_export);
924         } else {
925                 const struct ldlm_callback_suite cbs = {
926                         .lcs_completion = einfo->ei_cb_cp,
927                         .lcs_blocking   = einfo->ei_cb_bl,
928                         .lcs_glimpse    = einfo->ei_cb_gl
929                 };
930                 lock = ldlm_lock_create(ns, res_id, einfo->ei_type,
931                                         einfo->ei_mode, &cbs, einfo->ei_cbdata,
932                                         lvb_len, lvb_type);
933                 if (IS_ERR(lock))
934                         RETURN(PTR_ERR(lock));
935
936                 if (einfo->ei_cb_created)
937                         einfo->ei_cb_created(lock);
938
939                 /* for the local lock, add the reference */
940                 ldlm_lock_addref_internal(lock, einfo->ei_mode);
941                 ldlm_lock2handle(lock, lockh);
942                 if (policy != NULL)
943                         lock->l_policy_data = *policy;
944
945                 if (einfo->ei_type == LDLM_EXTENT) {
946                         /* extent lock without policy is a bug */
947                         if (policy == NULL)
948                                 LBUG();
949
950                         lock->l_req_extent = policy->l_extent;
951                 }
952                 LDLM_DEBUG(lock, "client-side enqueue START, flags %#llx",
953                            *flags);
954         }
955
956         lock->l_conn_export = exp;
957         lock->l_export = NULL;
958         lock->l_blocking_ast = einfo->ei_cb_bl;
959         lock->l_flags |= (*flags & (LDLM_FL_NO_LRU | LDLM_FL_EXCL));
960         lock->l_activity = ktime_get_real_seconds();
961
962         /* lock not sent to server yet */
963         if (reqp == NULL || *reqp == NULL) {
964                 req = ldlm_enqueue_pack(exp, lvb_len);
965                 if (IS_ERR(req)) {
966                         failed_lock_cleanup(ns, lock, einfo->ei_mode);
967                         LDLM_LOCK_RELEASE(lock);
968                         RETURN(PTR_ERR(req));
969                 }
970
971                 req_passed_in = 0;
972                 if (reqp)
973                         *reqp = req;
974         } else {
975                 int len;
976
977                 req = *reqp;
978                 len = req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ,
979                                            RCL_CLIENT);
980                 LASSERTF(len >= sizeof(*body), "buflen[%d] = %d, not %d\n",
981                          DLM_LOCKREQ_OFF, len, (int)sizeof(*body));
982         }
983
984         if (*flags & LDLM_FL_NDELAY) {
985                 DEBUG_REQ(D_DLMTRACE, req, "enque lock with no delay\n");
986                 req->rq_no_resend = req->rq_no_delay = 1;
987                 /* probably set a shorter timeout value and handle ETIMEDOUT
988                  * in osc_lock_upcall() correctly */
989                 /* lustre_msg_set_timeout(req, req->rq_timeout / 2); */
990         }
991
992         /* Dump lock data into the request buffer */
993         body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
994         ldlm_lock2desc(lock, &body->lock_desc);
995         body->lock_flags = ldlm_flags_to_wire(*flags);
996         body->lock_handle[0] = *lockh;
997
998         /* extended LDLM opcodes in client stats */
999         if (exp->exp_obd->obd_svc_stats != NULL) {
1000                 bool glimpse = *flags & LDLM_FL_HAS_INTENT;
1001
1002                 /* OST glimpse has no intent buffer */
1003                 if (req_capsule_has_field(&req->rq_pill, &RMF_LDLM_INTENT,
1004                                           RCL_CLIENT)) {
1005                         struct ldlm_intent *it;
1006
1007                         it = req_capsule_client_get(&req->rq_pill,
1008                                                     &RMF_LDLM_INTENT);
1009                         glimpse = (it && (it->opc == IT_GLIMPSE));
1010                 }
1011
1012                 if (!glimpse)
1013                         ldlm_svc_get_eopc(body, exp->exp_obd->obd_svc_stats);
1014                 else
1015                         lprocfs_counter_incr(exp->exp_obd->obd_svc_stats,
1016                                              PTLRPC_LAST_CNTR +
1017                                              LDLM_GLIMPSE_ENQUEUE);
1018         }
1019
1020         if (async) {
1021                 LASSERT(reqp != NULL);
1022                 RETURN(0);
1023         }
1024
1025         LDLM_DEBUG(lock, "sending request");
1026
1027         rc = ptlrpc_queue_wait(req);
1028
1029         err = ldlm_cli_enqueue_fini(exp, req, einfo->ei_type, policy ? 1 : 0,
1030                                     einfo->ei_mode, flags, lvb, lvb_len,
1031                                     lockh, rc);
1032
1033         /* If ldlm_cli_enqueue_fini did not find the lock, we need to free
1034          * one reference that we took */
1035         if (err == -ENOLCK)
1036                 LDLM_LOCK_RELEASE(lock);
1037         else
1038                 rc = err;
1039
1040         if (!req_passed_in && req != NULL) {
1041                 ptlrpc_req_finished(req);
1042                 if (reqp)
1043                         *reqp = NULL;
1044         }
1045
1046         RETURN(rc);
1047 }
1048 EXPORT_SYMBOL(ldlm_cli_enqueue);
1049
1050 /**
1051  * Client-side lock convert reply handling.
1052  *
1053  * Finish client lock converting, checks for concurrent converts
1054  * and clear 'converting' flag so lock can be placed back into LRU.
1055  */
1056 static int lock_convert_interpret(const struct lu_env *env,
1057                                   struct ptlrpc_request *req,
1058                                   void *args, int rc)
1059 {
1060         struct ldlm_async_args *aa = args;
1061         struct ldlm_lock *lock;
1062         struct ldlm_reply *reply;
1063
1064         ENTRY;
1065
1066         lock = ldlm_handle2lock(&aa->lock_handle);
1067         if (!lock) {
1068                 LDLM_DEBUG_NOLOCK("convert ACK for unknown local cookie %#llx",
1069                         aa->lock_handle.cookie);
1070                 RETURN(-ESTALE);
1071         }
1072
1073         LDLM_DEBUG(lock, "CONVERTED lock:");
1074
1075         if (rc != ELDLM_OK)
1076                 GOTO(out, rc);
1077
1078         reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP);
1079         if (reply == NULL)
1080                 GOTO(out, rc = -EPROTO);
1081
1082         if (reply->lock_handle.cookie != aa->lock_handle.cookie) {
1083                 LDLM_ERROR(lock, "convert ACK with wrong lock cookie %#llx"
1084                            " but cookie %#llx from server %s id %s\n",
1085                            aa->lock_handle.cookie, reply->lock_handle.cookie,
1086                            req->rq_export->exp_client_uuid.uuid,
1087                            libcfs_id2str(req->rq_peer));
1088                 GOTO(out, rc = ELDLM_NO_LOCK_DATA);
1089         }
1090
1091         lock_res_and_lock(lock);
1092         /* Lock convert is sent for any new bits to drop, the converting flag
1093          * is dropped when ibits on server are the same as on client. Meanwhile
1094          * that can be so that more later convert will be replied first with
1095          * and clear converting flag, so in case of such race just exit here.
1096          * if lock has no converting bits then  */
1097         if (!ldlm_is_converting(lock)) {
1098                 LDLM_DEBUG(lock, "convert ACK for lock without converting flag,"
1099                            " reply ibits %#llx",
1100                            reply->lock_desc.l_policy_data.l_inodebits.bits);
1101         } else if (reply->lock_desc.l_policy_data.l_inodebits.bits !=
1102                    lock->l_policy_data.l_inodebits.bits) {
1103                 /* Compare server returned lock ibits and local lock ibits
1104                  * if they are the same we consider convertion is done,
1105                  * otherwise we have more converts inflight and keep
1106                  * converting flag.
1107                  */
1108                 LDLM_DEBUG(lock, "convert ACK with ibits %#llx\n",
1109                            reply->lock_desc.l_policy_data.l_inodebits.bits);
1110         } else {
1111                 ldlm_clear_converting(lock);
1112
1113                 /* Concurrent BL AST may arrive and cause another convert
1114                  * or cancel so just do nothing here if bl_ast is set,
1115                  * finish with convert otherwise.
1116                  */
1117                 if (!ldlm_is_bl_ast(lock)) {
1118                         struct ldlm_namespace *ns = ldlm_lock_to_ns(lock);
1119
1120                         /* Drop cancel_bits since there are no more converts
1121                          * and put lock into LRU if it is still not used and
1122                          * is not there yet.
1123                          */
1124                         lock->l_policy_data.l_inodebits.cancel_bits = 0;
1125                         if (!lock->l_readers && !lock->l_writers &&
1126                             !ldlm_is_canceling(lock)) {
1127                                 spin_lock(&ns->ns_lock);
1128                                 /* there is check for list_empty() inside */
1129                                 ldlm_lock_remove_from_lru_nolock(lock);
1130                                 ldlm_lock_add_to_lru_nolock(lock);
1131                                 spin_unlock(&ns->ns_lock);
1132                         }
1133                 }
1134         }
1135         unlock_res_and_lock(lock);
1136 out:
1137         if (rc) {
1138                 int flag;
1139
1140                 lock_res_and_lock(lock);
1141                 if (ldlm_is_converting(lock)) {
1142                         ldlm_clear_converting(lock);
1143                         ldlm_set_cbpending(lock);
1144                         ldlm_set_bl_ast(lock);
1145                         lock->l_policy_data.l_inodebits.cancel_bits = 0;
1146                 }
1147                 unlock_res_and_lock(lock);
1148
1149                 /* fallback to normal lock cancel. If rc means there is no
1150                  * valid lock on server, do only local cancel */
1151                 if (rc == ELDLM_NO_LOCK_DATA)
1152                         flag = LCF_LOCAL;
1153                 else
1154                         flag = LCF_ASYNC;
1155
1156                 rc = ldlm_cli_cancel(&aa->lock_handle, flag);
1157                 if (rc < 0)
1158                         LDLM_DEBUG(lock, "failed to cancel lock: rc = %d\n",
1159                                    rc);
1160         }
1161         LDLM_LOCK_PUT(lock);
1162         RETURN(rc);
1163 }
1164
1165 /**
1166  * Client-side IBITS lock convert.
1167  *
1168  * Inform server that lock has been converted instead of canceling.
1169  * Server finishes convert on own side and does reprocess to grant
1170  * all related waiting locks.
1171  *
1172  * Since convert means only ibits downgrading, client doesn't need to
1173  * wait for server reply to finish local converting process so this request
1174  * is made asynchronous.
1175  *
1176  */
1177 int ldlm_cli_convert(struct ldlm_lock *lock, __u32 *flags)
1178 {
1179         struct ldlm_request *body;
1180         struct ptlrpc_request *req;
1181         struct ldlm_async_args *aa;
1182         struct obd_export *exp = lock->l_conn_export;
1183
1184         ENTRY;
1185
1186         if (exp == NULL) {
1187                 LDLM_ERROR(lock, "convert must not be called on local locks.");
1188                 RETURN(-EINVAL);
1189         }
1190
1191         /* this is better to check earlier and it is done so already,
1192          * but this check is kept too as final one to issue an error
1193          * if any new code will miss such check.
1194          */
1195         if (!exp_connect_lock_convert(exp)) {
1196                 LDLM_ERROR(lock, "server doesn't support lock convert\n");
1197                 RETURN(-EPROTO);
1198         }
1199
1200         if (lock->l_resource->lr_type != LDLM_IBITS) {
1201                 LDLM_ERROR(lock, "convert works with IBITS locks only.");
1202                 RETURN(-EINVAL);
1203         }
1204
1205         LDLM_DEBUG(lock, "client-side convert");
1206
1207         req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
1208                                         &RQF_LDLM_CONVERT, LUSTRE_DLM_VERSION,
1209                                         LDLM_CONVERT);
1210         if (req == NULL)
1211                 RETURN(-ENOMEM);
1212
1213         body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
1214         body->lock_handle[0] = lock->l_remote_handle;
1215
1216         body->lock_desc.l_req_mode = lock->l_req_mode;
1217         body->lock_desc.l_granted_mode = lock->l_granted_mode;
1218
1219         body->lock_desc.l_policy_data.l_inodebits.bits =
1220                                         lock->l_policy_data.l_inodebits.bits;
1221         body->lock_desc.l_policy_data.l_inodebits.cancel_bits = 0;
1222
1223         body->lock_flags = ldlm_flags_to_wire(*flags);
1224         body->lock_count = 1;
1225
1226         ptlrpc_request_set_replen(req);
1227
1228         /*
1229          * Use cancel portals for convert as well as high-priority handling.
1230          */
1231         req->rq_request_portal = LDLM_CANCEL_REQUEST_PORTAL;
1232         req->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL;
1233
1234         ptlrpc_at_set_req_timeout(req);
1235
1236         if (exp->exp_obd->obd_svc_stats != NULL)
1237                 lprocfs_counter_incr(exp->exp_obd->obd_svc_stats,
1238                                      LDLM_CONVERT - LDLM_FIRST_OPC);
1239
1240         aa = ptlrpc_req_async_args(req);
1241         ldlm_lock2handle(lock, &aa->lock_handle);
1242         req->rq_interpret_reply = lock_convert_interpret;
1243
1244         ptlrpcd_add_req(req);
1245         RETURN(0);
1246 }
1247
1248 /**
1249  * Cancel locks locally.
1250  * Returns:
1251  * \retval LDLM_FL_LOCAL_ONLY if there is no need for a CANCEL RPC to the server
1252  * \retval LDLM_FL_CANCELING otherwise;
1253  * \retval LDLM_FL_BL_AST if there is a need for a separate CANCEL RPC.
1254  */
1255 static __u64 ldlm_cli_cancel_local(struct ldlm_lock *lock)
1256 {
1257         __u64 rc = LDLM_FL_LOCAL_ONLY;
1258         ENTRY;
1259
1260         if (lock->l_conn_export) {
1261                 bool local_only;
1262
1263                 LDLM_DEBUG(lock, "client-side cancel");
1264                 /* Set this flag to prevent others from getting new references*/
1265                 lock_res_and_lock(lock);
1266                 ldlm_set_cbpending(lock);
1267                 local_only = !!(lock->l_flags &
1268                                 (LDLM_FL_LOCAL_ONLY|LDLM_FL_CANCEL_ON_BLOCK));
1269                 ldlm_cancel_callback(lock);
1270                 rc = (ldlm_is_bl_ast(lock)) ?
1271                         LDLM_FL_BL_AST : LDLM_FL_CANCELING;
1272                 unlock_res_and_lock(lock);
1273
1274                 if (local_only) {
1275                         CDEBUG(D_DLMTRACE, "not sending request (at caller's "
1276                                "instruction)\n");
1277                         rc = LDLM_FL_LOCAL_ONLY;
1278                 }
1279                 ldlm_lock_cancel(lock);
1280         } else {
1281                 if (ns_is_client(ldlm_lock_to_ns(lock))) {
1282                         LDLM_ERROR(lock, "Trying to cancel local lock");
1283                         LBUG();
1284                 }
1285                 LDLM_DEBUG(lock, "server-side local cancel");
1286                 ldlm_lock_cancel(lock);
1287                 ldlm_reprocess_all(lock->l_resource);
1288         }
1289
1290         RETURN(rc);
1291 }
1292
1293 /**
1294  * Pack \a count locks in \a head into ldlm_request buffer of request \a req.
1295  */
1296 static void ldlm_cancel_pack(struct ptlrpc_request *req,
1297                              struct list_head *head, int count)
1298 {
1299         struct ldlm_request *dlm;
1300         struct ldlm_lock *lock;
1301         int max, packed = 0;
1302         ENTRY;
1303
1304         dlm = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
1305         LASSERT(dlm != NULL);
1306
1307         /* Check the room in the request buffer. */
1308         max = req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT) -
1309                 sizeof(struct ldlm_request);
1310         max /= sizeof(struct lustre_handle);
1311         max += LDLM_LOCKREQ_HANDLES;
1312         LASSERT(max >= dlm->lock_count + count);
1313
1314         /* XXX: it would be better to pack lock handles grouped by resource.
1315          * so that the server cancel would call filter_lvbo_update() less
1316          * frequently. */
1317         list_for_each_entry(lock, head, l_bl_ast) {
1318                 if (!count--)
1319                         break;
1320                 LASSERT(lock->l_conn_export);
1321                 /* Pack the lock handle to the given request buffer. */
1322                 LDLM_DEBUG(lock, "packing");
1323                 dlm->lock_handle[dlm->lock_count++] = lock->l_remote_handle;
1324                 packed++;
1325         }
1326         CDEBUG(D_DLMTRACE, "%d locks packed\n", packed);
1327         EXIT;
1328 }
1329
1330 /**
1331  * Prepare and send a batched cancel RPC. It will include \a count lock
1332  * handles of locks given in \a cancels list. */
1333 int ldlm_cli_cancel_req(struct obd_export *exp, struct list_head *cancels,
1334                         int count, enum ldlm_cancel_flags flags)
1335 {
1336         struct ptlrpc_request *req = NULL;
1337         struct obd_import *imp;
1338         int free, sent = 0;
1339         int rc = 0;
1340         ENTRY;
1341
1342         LASSERT(exp != NULL);
1343         LASSERT(count > 0);
1344
1345         CFS_FAIL_TIMEOUT(OBD_FAIL_LDLM_PAUSE_CANCEL, cfs_fail_val);
1346
1347         if (CFS_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_RACE))
1348                 RETURN(count);
1349
1350         free = ldlm_format_handles_avail(class_exp2cliimp(exp),
1351                                          &RQF_LDLM_CANCEL, RCL_CLIENT, 0);
1352         if (count > free)
1353                 count = free;
1354
1355         while (1) {
1356                 imp = class_exp2cliimp(exp);
1357                 if (imp == NULL || imp->imp_invalid) {
1358                         CDEBUG(D_DLMTRACE,
1359                                "skipping cancel on invalid import %p\n", imp);
1360                         RETURN(count);
1361                 }
1362
1363                 req = ptlrpc_request_alloc(imp, &RQF_LDLM_CANCEL);
1364                 if (req == NULL)
1365                         GOTO(out, rc = -ENOMEM);
1366
1367                 req_capsule_filled_sizes(&req->rq_pill, RCL_CLIENT);
1368                 req_capsule_set_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT,
1369                                      ldlm_request_bufsize(count, LDLM_CANCEL));
1370
1371                 rc = ptlrpc_request_pack(req, LUSTRE_DLM_VERSION, LDLM_CANCEL);
1372                 if (rc) {
1373                         ptlrpc_request_free(req);
1374                         GOTO(out, rc);
1375                 }
1376
1377                 /* If OSP want cancel cross-MDT lock, let's not block it in
1378                  * in recovery, otherwise the lock will not released, if
1379                  * the remote target is also in recovery, and it also need
1380                  * this lock, it might cause deadlock. */
1381                 if (exp_connect_flags(exp) & OBD_CONNECT_MDS_MDS &&
1382                     exp->exp_obd->obd_lu_dev != NULL &&
1383                     exp->exp_obd->obd_lu_dev->ld_site != NULL) {
1384                         struct lu_device *top_dev;
1385
1386                         top_dev = exp->exp_obd->obd_lu_dev->ld_site->ls_top_dev;
1387                         if (top_dev != NULL &&
1388                             top_dev->ld_obd->obd_recovering)
1389                                 req->rq_allow_replay = 1;
1390                 }
1391
1392                 req->rq_request_portal = LDLM_CANCEL_REQUEST_PORTAL;
1393                 req->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL;
1394                 ptlrpc_at_set_req_timeout(req);
1395
1396                 ldlm_cancel_pack(req, cancels, count);
1397
1398                 ptlrpc_request_set_replen(req);
1399                 if (flags & LCF_ASYNC) {
1400                         ptlrpcd_add_req(req);
1401                         sent = count;
1402                         GOTO(out, 0);
1403                 }
1404
1405                 rc = ptlrpc_queue_wait(req);
1406                 if (rc == LUSTRE_ESTALE) {
1407                         CDEBUG(D_DLMTRACE, "client/server (nid %s) "
1408                                "out of sync -- not fatal\n",
1409                                libcfs_nid2str(req->rq_import->
1410                                               imp_connection->c_peer.nid));
1411                         rc = 0;
1412                 } else if (rc == -ETIMEDOUT && /* check there was no reconnect*/
1413                            req->rq_import_generation == imp->imp_generation) {
1414                         ptlrpc_req_finished(req);
1415                         continue;
1416                 } else if (rc != ELDLM_OK) {
1417                         /* -ESHUTDOWN is common on umount */
1418                         CDEBUG_LIMIT(rc == -ESHUTDOWN ? D_DLMTRACE : D_ERROR,
1419                                      "Got rc %d from cancel RPC: "
1420                                      "canceling anyway\n", rc);
1421                         break;
1422                 }
1423                 sent = count;
1424                 break;
1425         }
1426
1427         ptlrpc_req_finished(req);
1428         EXIT;
1429 out:
1430         return sent ? sent : rc;
1431 }
1432
1433 static inline struct ldlm_pool *ldlm_imp2pl(struct obd_import *imp)
1434 {
1435         LASSERT(imp != NULL);
1436         return &imp->imp_obd->obd_namespace->ns_pool;
1437 }
1438
1439 /**
1440  * Update client's OBD pool related fields with new SLV and Limit from \a req.
1441  */
1442 int ldlm_cli_update_pool(struct ptlrpc_request *req)
1443 {
1444         struct obd_device *obd;
1445         __u64 new_slv;
1446         __u32 new_limit;
1447         ENTRY;
1448         if (unlikely(!req->rq_import || !req->rq_import->imp_obd ||
1449                      !imp_connect_lru_resize(req->rq_import)))
1450         {
1451                 /*
1452                  * Do nothing for corner cases.
1453                  */
1454                 RETURN(0);
1455         }
1456
1457         /* In some cases RPC may contain SLV and limit zeroed out. This
1458          * is the case when server does not support LRU resize feature.
1459          * This is also possible in some recovery cases when server-side
1460          * reqs have no reference to the OBD export and thus access to
1461          * server-side namespace is not possible. */
1462         if (lustre_msg_get_slv(req->rq_repmsg) == 0 ||
1463             lustre_msg_get_limit(req->rq_repmsg) == 0) {
1464                 DEBUG_REQ(D_HA, req, "Zero SLV or Limit found "
1465                           "(SLV: %llu, Limit: %u)",
1466                           lustre_msg_get_slv(req->rq_repmsg),
1467                           lustre_msg_get_limit(req->rq_repmsg));
1468                 RETURN(0);
1469         }
1470
1471         new_limit = lustre_msg_get_limit(req->rq_repmsg);
1472         new_slv = lustre_msg_get_slv(req->rq_repmsg);
1473         obd = req->rq_import->imp_obd;
1474
1475         /* Set new SLV and limit in OBD fields to make them accessible
1476          * to the pool thread. We do not access obd_namespace and pool
1477          * directly here as there is no reliable way to make sure that
1478          * they are still alive at cleanup time. Evil races are possible
1479          * which may cause Oops at that time. */
1480         write_lock(&obd->obd_pool_lock);
1481         obd->obd_pool_slv = new_slv;
1482         obd->obd_pool_limit = new_limit;
1483         write_unlock(&obd->obd_pool_lock);
1484
1485         RETURN(0);
1486 }
1487
1488 /**
1489  * Client side lock cancel.
1490  *
1491  * Lock must not have any readers or writers by this time.
1492  */
1493 int ldlm_cli_cancel(const struct lustre_handle *lockh,
1494                     enum ldlm_cancel_flags cancel_flags)
1495 {
1496         struct obd_export *exp;
1497         enum ldlm_lru_flags lru_flags;
1498         int avail, count = 1;
1499         __u64 rc = 0;
1500         struct ldlm_namespace *ns;
1501         struct ldlm_lock *lock;
1502         struct list_head cancels = LIST_HEAD_INIT(cancels);
1503
1504         ENTRY;
1505
1506         lock = ldlm_handle2lock_long(lockh, 0);
1507         if (lock == NULL) {
1508                 LDLM_DEBUG_NOLOCK("lock is already being destroyed");
1509                 RETURN(0);
1510         }
1511
1512         /* Convert lock bits instead of cancel for IBITS locks */
1513         if (cancel_flags & LCF_CONVERT) {
1514                 LASSERT(lock->l_resource->lr_type == LDLM_IBITS);
1515                 LASSERT(lock->l_policy_data.l_inodebits.cancel_bits != 0);
1516
1517                 rc = ldlm_cli_dropbits(lock,
1518                                 lock->l_policy_data.l_inodebits.cancel_bits);
1519                 if (rc == 0) {
1520                         LDLM_LOCK_RELEASE(lock);
1521                         RETURN(0);
1522                 }
1523         }
1524
1525         lock_res_and_lock(lock);
1526         /* Lock is being canceled and the caller doesn't want to wait */
1527         if (ldlm_is_canceling(lock)) {
1528                 if (cancel_flags & LCF_ASYNC) {
1529                         unlock_res_and_lock(lock);
1530                 } else {
1531                         struct l_wait_info lwi = { 0 };
1532
1533                         unlock_res_and_lock(lock);
1534                         l_wait_event(lock->l_waitq, is_bl_done(lock), &lwi);
1535                 }
1536                 LDLM_LOCK_RELEASE(lock);
1537                 RETURN(0);
1538         }
1539
1540         /* Lock is being converted, cancel it immediately.
1541          * When convert will end, it releases lock and it will be gone.
1542          */
1543         if (ldlm_is_converting(lock)) {
1544                 /* set back flags removed by convert */
1545                 ldlm_set_cbpending(lock);
1546                 ldlm_set_bl_ast(lock);
1547         }
1548
1549         ldlm_set_canceling(lock);
1550         unlock_res_and_lock(lock);
1551
1552         if (cancel_flags & LCF_LOCAL)
1553                 OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_LOCAL_CANCEL_PAUSE,
1554                                  cfs_fail_val);
1555
1556         rc = ldlm_cli_cancel_local(lock);
1557         if (rc == LDLM_FL_LOCAL_ONLY || cancel_flags & LCF_LOCAL) {
1558                 LDLM_LOCK_RELEASE(lock);
1559                 RETURN(0);
1560         }
1561         /* Even if the lock is marked as LDLM_FL_BL_AST, this is a LDLM_CANCEL
1562          * RPC which goes to canceld portal, so we can cancel other LRU locks
1563          * here and send them all as one LDLM_CANCEL RPC. */
1564         LASSERT(list_empty(&lock->l_bl_ast));
1565         list_add(&lock->l_bl_ast, &cancels);
1566
1567         exp = lock->l_conn_export;
1568         if (exp_connect_cancelset(exp)) {
1569                 avail = ldlm_format_handles_avail(class_exp2cliimp(exp),
1570                                                   &RQF_LDLM_CANCEL,
1571                                                   RCL_CLIENT, 0);
1572                 LASSERT(avail > 0);
1573
1574                 ns = ldlm_lock_to_ns(lock);
1575                 lru_flags = ns_connect_lru_resize(ns) ?
1576                         LDLM_LRU_FLAG_LRUR : LDLM_LRU_FLAG_AGED;
1577                 count += ldlm_cancel_lru_local(ns, &cancels, 0, avail - 1,
1578                                                LCF_BL_AST, lru_flags);
1579         }
1580         ldlm_cli_cancel_list(&cancels, count, NULL, cancel_flags);
1581         RETURN(0);
1582 }
1583 EXPORT_SYMBOL(ldlm_cli_cancel);
1584
1585 /**
1586  * Locally cancel up to \a count locks in list \a cancels.
1587  * Return the number of cancelled locks.
1588  */
1589 int ldlm_cli_cancel_list_local(struct list_head *cancels, int count,
1590                                enum ldlm_cancel_flags cancel_flags)
1591 {
1592         struct list_head head = LIST_HEAD_INIT(head);
1593         struct ldlm_lock *lock, *next;
1594         int left = 0, bl_ast = 0;
1595         __u64 rc;
1596
1597         left = count;
1598         list_for_each_entry_safe(lock, next, cancels, l_bl_ast) {
1599                 if (left-- == 0)
1600                         break;
1601
1602                 if (cancel_flags & LCF_LOCAL) {
1603                         rc = LDLM_FL_LOCAL_ONLY;
1604                         ldlm_lock_cancel(lock);
1605                 } else {
1606                         rc = ldlm_cli_cancel_local(lock);
1607                 }
1608                 /* Until we have compound requests and can send LDLM_CANCEL
1609                  * requests batched with generic RPCs, we need to send cancels
1610                  * with the LDLM_FL_BL_AST flag in a separate RPC from
1611                  * the one being generated now. */
1612                 if (!(cancel_flags & LCF_BL_AST) && (rc == LDLM_FL_BL_AST)) {
1613                         LDLM_DEBUG(lock, "Cancel lock separately");
1614                         list_del_init(&lock->l_bl_ast);
1615                         list_add(&lock->l_bl_ast, &head);
1616                         bl_ast++;
1617                         continue;
1618                 }
1619                 if (rc == LDLM_FL_LOCAL_ONLY) {
1620                         /* CANCEL RPC should not be sent to server. */
1621                         list_del_init(&lock->l_bl_ast);
1622                         LDLM_LOCK_RELEASE(lock);
1623                         count--;
1624                 }
1625         }
1626         if (bl_ast > 0) {
1627                 count -= bl_ast;
1628                 ldlm_cli_cancel_list(&head, bl_ast, NULL, 0);
1629         }
1630
1631         RETURN(count);
1632 }
1633
1634 /**
1635  * Cancel as many locks as possible w/o sending any RPCs (e.g. to write back
1636  * dirty data, to close a file, ...) or waiting for any RPCs in-flight (e.g.
1637  * readahead requests, ...)
1638  */
1639 static enum ldlm_policy_res
1640 ldlm_cancel_no_wait_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock,
1641                            int unused, int added, int count)
1642 {
1643         enum ldlm_policy_res result = LDLM_POLICY_CANCEL_LOCK;
1644
1645         /* don't check added & count since we want to process all locks
1646          * from unused list.
1647          * It's fine to not take lock to access lock->l_resource since
1648          * the lock has already been granted so it won't change. */
1649         switch (lock->l_resource->lr_type) {
1650                 case LDLM_EXTENT:
1651                 case LDLM_IBITS:
1652                         if (ns->ns_cancel != NULL && ns->ns_cancel(lock) != 0)
1653                                 break;
1654                 default:
1655                         result = LDLM_POLICY_SKIP_LOCK;
1656                         break;
1657         }
1658
1659         RETURN(result);
1660 }
1661
1662 /**
1663  * Callback function for LRU-resize policy. Decides whether to keep
1664  * \a lock in LRU for current \a LRU size \a unused, added in current
1665  * scan \a added and number of locks to be preferably canceled \a count.
1666  *
1667  * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU in stop scanning
1668  *
1669  * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU
1670  */
1671 static enum ldlm_policy_res ldlm_cancel_lrur_policy(struct ldlm_namespace *ns,
1672                                                     struct ldlm_lock *lock,
1673                                                     int unused, int added,
1674                                                     int count)
1675 {
1676         ktime_t cur = ktime_get();
1677         struct ldlm_pool *pl = &ns->ns_pool;
1678         u64 slv, lvf, lv;
1679         s64 la;
1680
1681         /* Stop LRU processing when we reach past @count or have checked all
1682          * locks in LRU. */
1683         if (count && added >= count)
1684                 return LDLM_POLICY_KEEP_LOCK;
1685
1686         /* Despite of the LV, It doesn't make sense to keep the lock which
1687          * is unused for ns_max_age time.
1688          */
1689         if (ktime_after(ktime_get(),
1690                         ktime_add(lock->l_last_used, ns->ns_max_age)))
1691                 return LDLM_POLICY_CANCEL_LOCK;
1692
1693         slv = ldlm_pool_get_slv(pl);
1694         lvf = ldlm_pool_get_lvf(pl);
1695         la = div_u64(ktime_to_ns(ktime_sub(cur, lock->l_last_used)),
1696                      NSEC_PER_SEC);
1697         lv = lvf * la * unused;
1698
1699         /* Inform pool about current CLV to see it via debugfs. */
1700         ldlm_pool_set_clv(pl, lv);
1701
1702         /* Stop when SLV is not yet come from server or lv is smaller than
1703          * it is. */
1704         if (slv == 0 || lv < slv)
1705                 return LDLM_POLICY_KEEP_LOCK;
1706
1707         return LDLM_POLICY_CANCEL_LOCK;
1708 }
1709
1710 static enum ldlm_policy_res
1711 ldlm_cancel_lrur_no_wait_policy(struct ldlm_namespace *ns,
1712                                 struct ldlm_lock *lock,
1713                                 int unused, int added,
1714                                 int count)
1715 {
1716         enum ldlm_policy_res result;
1717
1718         result = ldlm_cancel_lrur_policy(ns, lock, unused, added, count);
1719         if (result == LDLM_POLICY_KEEP_LOCK)
1720                 return result;
1721
1722         return ldlm_cancel_no_wait_policy(ns, lock, unused, added, count);
1723 }
1724
1725 /**
1726  * Callback function for debugfs used policy. Makes decision whether to keep
1727  * \a lock in LRU for current \a LRU size \a unused, added in current scan \a
1728  * added and number of locks to be preferably canceled \a count.
1729  *
1730  * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU in stop scanning
1731  *
1732  * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU
1733  */
1734 static enum ldlm_policy_res ldlm_cancel_passed_policy(struct ldlm_namespace *ns,
1735                                                       struct ldlm_lock *lock,
1736                                                       int unused, int added,
1737                                                       int count)
1738 {
1739         /* Stop LRU processing when we reach past @count or have checked all
1740          * locks in LRU. */
1741         return (added >= count) ?
1742                 LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK;
1743 }
1744
1745 /**
1746  * Callback function for aged policy. Makes decision whether to keep \a lock in
1747  * LRU for current LRU size \a unused, added in current scan \a added and
1748  * number of locks to be preferably canceled \a count.
1749  *
1750  * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU in stop scanning
1751  *
1752  * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU
1753  */
1754 static enum ldlm_policy_res ldlm_cancel_aged_policy(struct ldlm_namespace *ns,
1755                                                     struct ldlm_lock *lock,
1756                                                     int unused, int added,
1757                                                     int count)
1758 {
1759         if ((added >= count) &&
1760             ktime_before(ktime_get(),
1761                          ktime_add(lock->l_last_used, ns->ns_max_age)))
1762                 return LDLM_POLICY_KEEP_LOCK;
1763
1764         return LDLM_POLICY_CANCEL_LOCK;
1765 }
1766
1767 static enum ldlm_policy_res
1768 ldlm_cancel_aged_no_wait_policy(struct ldlm_namespace *ns,
1769                                 struct ldlm_lock *lock,
1770                                 int unused, int added, int count)
1771 {
1772         enum ldlm_policy_res result;
1773
1774         result = ldlm_cancel_aged_policy(ns, lock, unused, added, count);
1775         if (result == LDLM_POLICY_KEEP_LOCK)
1776                 return result;
1777
1778         return ldlm_cancel_no_wait_policy(ns, lock, unused, added, count);
1779 }
1780
1781 /**
1782  * Callback function for default policy. Makes decision whether to keep \a lock
1783  * in LRU for current LRU size \a unused, added in current scan \a added and
1784  * number of locks to be preferably canceled \a count.
1785  *
1786  * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU in stop scanning
1787  *
1788  * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU
1789  */
1790 static
1791 enum ldlm_policy_res ldlm_cancel_default_policy(struct ldlm_namespace *ns,
1792                                                 struct ldlm_lock *lock,
1793                                                 int unused, int added,
1794                                                 int count)
1795 {
1796         /* Stop LRU processing when we reach past count or have checked all
1797          * locks in LRU. */
1798         return (added >= count) ?
1799                 LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK;
1800 }
1801
1802 typedef enum ldlm_policy_res
1803 (*ldlm_cancel_lru_policy_t)(struct ldlm_namespace *ns, struct ldlm_lock *lock,
1804                             int unused, int added, int count);
1805
1806 static ldlm_cancel_lru_policy_t
1807 ldlm_cancel_lru_policy(struct ldlm_namespace *ns, enum ldlm_lru_flags lru_flags)
1808 {
1809         if (ns_connect_lru_resize(ns)) {
1810                 if (lru_flags & LDLM_LRU_FLAG_SHRINK)
1811                         /* We kill passed number of old locks. */
1812                         return ldlm_cancel_passed_policy;
1813                 if (lru_flags & LDLM_LRU_FLAG_LRUR) {
1814                         if (lru_flags & LDLM_LRU_FLAG_NO_WAIT)
1815                                 return ldlm_cancel_lrur_no_wait_policy;
1816                         else
1817                                 return ldlm_cancel_lrur_policy;
1818                 }
1819                 if (lru_flags & LDLM_LRU_FLAG_PASSED)
1820                         return ldlm_cancel_passed_policy;
1821         } else {
1822                 if (lru_flags & LDLM_LRU_FLAG_AGED) {
1823                         if (lru_flags & LDLM_LRU_FLAG_NO_WAIT)
1824                                 return ldlm_cancel_aged_no_wait_policy;
1825                         else
1826                                 return ldlm_cancel_aged_policy;
1827                 }
1828         }
1829         if (lru_flags & LDLM_LRU_FLAG_NO_WAIT)
1830                 return ldlm_cancel_no_wait_policy;
1831
1832         return ldlm_cancel_default_policy;
1833 }
1834
1835 /**
1836  * - Free space in LRU for \a count new locks,
1837  *   redundant unused locks are canceled locally;
1838  * - also cancel locally unused aged locks;
1839  * - do not cancel more than \a max locks;
1840  * - GET the found locks and add them into the \a cancels list.
1841  *
1842  * A client lock can be added to the l_bl_ast list only when it is
1843  * marked LDLM_FL_CANCELING. Otherwise, somebody is already doing
1844  * CANCEL.  There are the following use cases:
1845  * ldlm_cancel_resource_local(), ldlm_cancel_lru_local() and
1846  * ldlm_cli_cancel(), which check and set this flag properly. As any
1847  * attempt to cancel a lock rely on this flag, l_bl_ast list is accessed
1848  * later without any special locking.
1849  *
1850  * Calling policies for enabled LRU resize:
1851  * ----------------------------------------
1852  * flags & LDLM_LRU_FLAG_LRUR - use LRU resize policy (SLV from server) to
1853  *                              cancel not more than \a count locks;
1854  *
1855  * flags & LDLM_LRU_FLAG_PASSED - cancel \a count number of old locks (located
1856  *                              at the beginning of LRU list);
1857  *
1858  * flags & LDLM_LRU_FLAG_SHRINK - cancel not more than \a count locks according
1859  *                              to memory pressre policy function;
1860  *
1861  * flags & LDLM_LRU_FLAG_AGED - cancel \a count locks according to "aged policy"
1862  *
1863  * flags & LDLM_LRU_FLAG_NO_WAIT - cancel as many unused locks as possible
1864  *                              (typically before replaying locks) w/o
1865  *                              sending any RPCs or waiting for any
1866  *                              outstanding RPC to complete.
1867  *
1868  * flags & LDLM_CANCEL_CLEANUP - when cancelling read locks, do not check for
1869  *                              other read locks covering the same pages, just
1870  *                              discard those pages.
1871  */
1872 static int ldlm_prepare_lru_list(struct ldlm_namespace *ns,
1873                                  struct list_head *cancels, int count, int max,
1874                                  enum ldlm_lru_flags lru_flags)
1875 {
1876         ldlm_cancel_lru_policy_t pf;
1877         int added = 0;
1878         int no_wait = lru_flags & LDLM_LRU_FLAG_NO_WAIT;
1879
1880         ENTRY;
1881
1882         if (!ns_connect_lru_resize(ns))
1883                 count += ns->ns_nr_unused - ns->ns_max_unused;
1884
1885         pf = ldlm_cancel_lru_policy(ns, lru_flags);
1886         LASSERT(pf != NULL);
1887
1888         /* For any flags, stop scanning if @max is reached. */
1889         while (!list_empty(&ns->ns_unused_list) && (max == 0 || added < max)) {
1890                 struct ldlm_lock *lock;
1891                 struct list_head *item, *next;
1892                 enum ldlm_policy_res result;
1893                 ktime_t last_use = ktime_set(0, 0);
1894
1895                 spin_lock(&ns->ns_lock);
1896                 item = no_wait ? ns->ns_last_pos : &ns->ns_unused_list;
1897                 for (item = item->next, next = item->next;
1898                      item != &ns->ns_unused_list;
1899                      item = next, next = item->next) {
1900                         lock = list_entry(item, struct ldlm_lock, l_lru);
1901
1902                         /* No locks which got blocking requests. */
1903                         LASSERT(!ldlm_is_bl_ast(lock));
1904
1905                         if (!ldlm_is_canceling(lock) &&
1906                             !ldlm_is_converting(lock))
1907                                 break;
1908
1909                         /* Somebody is already doing CANCEL. No need for this
1910                          * lock in LRU, do not traverse it again. */
1911                         ldlm_lock_remove_from_lru_nolock(lock);
1912                 }
1913                 if (item == &ns->ns_unused_list) {
1914                         spin_unlock(&ns->ns_lock);
1915                         break;
1916                 }
1917
1918                 last_use = lock->l_last_used;
1919
1920                 LDLM_LOCK_GET(lock);
1921                 spin_unlock(&ns->ns_lock);
1922                 lu_ref_add(&lock->l_reference, __FUNCTION__, current);
1923
1924                 /* Pass the lock through the policy filter and see if it
1925                  * should stay in LRU.
1926                  *
1927                  * Even for shrinker policy we stop scanning if
1928                  * we find a lock that should stay in the cache.
1929                  * We should take into account lock age anyway
1930                  * as a new lock is a valuable resource even if
1931                  * it has a low weight.
1932                  *
1933                  * That is, for shrinker policy we drop only
1934                  * old locks, but additionally choose them by
1935                  * their weight. Big extent locks will stay in
1936                  * the cache. */
1937                 result = pf(ns, lock, ns->ns_nr_unused, added, count);
1938                 if (result == LDLM_POLICY_KEEP_LOCK) {
1939                         lu_ref_del(&lock->l_reference, __func__, current);
1940                         LDLM_LOCK_RELEASE(lock);
1941                         break;
1942                 }
1943
1944                 if (result == LDLM_POLICY_SKIP_LOCK) {
1945                         lu_ref_del(&lock->l_reference, __func__, current);
1946                         if (no_wait) {
1947                                 spin_lock(&ns->ns_lock);
1948                                 if (!list_empty(&lock->l_lru) &&
1949                                     lock->l_lru.prev == ns->ns_last_pos)
1950                                         ns->ns_last_pos = &lock->l_lru;
1951                                 spin_unlock(&ns->ns_lock);
1952                         }
1953
1954                         LDLM_LOCK_RELEASE(lock);
1955                         continue;
1956                 }
1957
1958                 lock_res_and_lock(lock);
1959                 /* Check flags again under the lock. */
1960                 if (ldlm_is_canceling(lock) || ldlm_is_converting(lock) ||
1961                     ldlm_lock_remove_from_lru_check(lock, last_use) == 0) {
1962                         /* Another thread is removing lock from LRU, or
1963                          * somebody is already doing CANCEL, or there
1964                          * is a blocking request which will send cancel
1965                          * by itself, or the lock is no longer unused or
1966                          * the lock has been used since the pf() call and
1967                          * pages could be put under it. */
1968                         unlock_res_and_lock(lock);
1969                         lu_ref_del(&lock->l_reference, __FUNCTION__, current);
1970                         LDLM_LOCK_RELEASE(lock);
1971                         continue;
1972                 }
1973                 LASSERT(!lock->l_readers && !lock->l_writers);
1974
1975                 /* If we have chosen to cancel this lock voluntarily, we
1976                  * better send cancel notification to server, so that it
1977                  * frees appropriate state. This might lead to a race
1978                  * where while we are doing cancel here, server is also
1979                  * silently cancelling this lock. */
1980                 ldlm_clear_cancel_on_block(lock);
1981
1982                 /* Setting the CBPENDING flag is a little misleading,
1983                  * but prevents an important race; namely, once
1984                  * CBPENDING is set, the lock can accumulate no more
1985                  * readers/writers. Since readers and writers are
1986                  * already zero here, ldlm_lock_decref() won't see
1987                  * this flag and call l_blocking_ast */
1988                 lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_CANCELING;
1989
1990                 if ((lru_flags & LDLM_LRU_FLAG_CLEANUP) &&
1991                     (lock->l_resource->lr_type == LDLM_EXTENT ||
1992                      ldlm_has_dom(lock)) && lock->l_granted_mode == LCK_PR)
1993                         ldlm_set_discard_data(lock);
1994
1995                 /* We can't re-add to l_lru as it confuses the
1996                  * refcounting in ldlm_lock_remove_from_lru() if an AST
1997                  * arrives after we drop lr_lock below. We use l_bl_ast
1998                  * and can't use l_pending_chain as it is used both on
1999                  * server and client nevertheless bug 5666 says it is
2000                  * used only on server */
2001                 LASSERT(list_empty(&lock->l_bl_ast));
2002                 list_add(&lock->l_bl_ast, cancels);
2003                 unlock_res_and_lock(lock);
2004                 lu_ref_del(&lock->l_reference, __FUNCTION__, current);
2005                 added++;
2006         }
2007         RETURN(added);
2008 }
2009
2010 int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels,
2011                           int count, int max,
2012                           enum ldlm_cancel_flags cancel_flags,
2013                           enum ldlm_lru_flags lru_flags)
2014 {
2015         int added;
2016
2017         added = ldlm_prepare_lru_list(ns, cancels, count, max, lru_flags);
2018         if (added <= 0)
2019                 return added;
2020
2021         return ldlm_cli_cancel_list_local(cancels, added, cancel_flags);
2022 }
2023
2024 /**
2025  * Cancel at least \a nr locks from given namespace LRU.
2026  *
2027  * When called with LCF_ASYNC the blocking callback will be handled
2028  * in a thread and this function will return after the thread has been
2029  * asked to call the callback.  When called with LCF_ASYNC the blocking
2030  * callback will be performed in this function.
2031  */
2032 int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr,
2033                     enum ldlm_cancel_flags cancel_flags,
2034                     enum ldlm_lru_flags lru_flags)
2035 {
2036         struct list_head cancels = LIST_HEAD_INIT(cancels);
2037         int count, rc;
2038         ENTRY;
2039
2040         /* Just prepare the list of locks, do not actually cancel them yet.
2041          * Locks are cancelled later in a separate thread. */
2042         count = ldlm_prepare_lru_list(ns, &cancels, nr, 0, lru_flags);
2043         rc = ldlm_bl_to_thread_list(ns, NULL, &cancels, count, cancel_flags);
2044         if (rc == 0)
2045                 RETURN(count);
2046
2047         RETURN(0);
2048 }
2049
2050 /**
2051  * Find and cancel locally unused locks found on resource, matched to the
2052  * given policy, mode. GET the found locks and add them into the \a cancels
2053  * list.
2054  */
2055 int ldlm_cancel_resource_local(struct ldlm_resource *res,
2056                                struct list_head *cancels,
2057                                union ldlm_policy_data *policy,
2058                                enum ldlm_mode mode, __u64 lock_flags,
2059                                enum ldlm_cancel_flags cancel_flags,
2060                                void *opaque)
2061 {
2062         struct ldlm_lock *lock;
2063         int count = 0;
2064
2065         ENTRY;
2066
2067         lock_res(res);
2068         list_for_each_entry(lock, &res->lr_granted, l_res_link) {
2069                 if (opaque != NULL && lock->l_ast_data != opaque) {
2070                         LDLM_ERROR(lock, "data %p doesn't match opaque %p",
2071                                    lock->l_ast_data, opaque);
2072                         continue;
2073                 }
2074
2075                 if (lock->l_readers || lock->l_writers)
2076                         continue;
2077
2078                 /* If somebody is already doing CANCEL, or blocking AST came,
2079                  * or lock is being converted then skip this lock. */
2080                 if (ldlm_is_bl_ast(lock) || ldlm_is_canceling(lock) ||
2081                     ldlm_is_converting(lock))
2082                         continue;
2083
2084                 if (lockmode_compat(lock->l_granted_mode, mode))
2085                         continue;
2086
2087                 /* If policy is given and this is IBITS lock, add to list only
2088                  * those locks that match by policy.
2089                  * Skip locks with DoM bit always to don't flush data.
2090                  */
2091                 if (policy && (lock->l_resource->lr_type == LDLM_IBITS) &&
2092                     (!(lock->l_policy_data.l_inodebits.bits &
2093                       policy->l_inodebits.bits) || ldlm_has_dom(lock)))
2094                         continue;
2095
2096                 /* See CBPENDING comment in ldlm_cancel_lru */
2097                 lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_CANCELING |
2098                                  lock_flags;
2099
2100                 LASSERT(list_empty(&lock->l_bl_ast));
2101                 list_add(&lock->l_bl_ast, cancels);
2102                 LDLM_LOCK_GET(lock);
2103                 count++;
2104         }
2105         unlock_res(res);
2106
2107         RETURN(ldlm_cli_cancel_list_local(cancels, count, cancel_flags));
2108 }
2109 EXPORT_SYMBOL(ldlm_cancel_resource_local);
2110
2111 /**
2112  * Cancel client-side locks from a list and send/prepare cancel RPCs to the
2113  * server.
2114  * If \a req is NULL, send CANCEL request to server with handles of locks
2115  * in the \a cancels. If EARLY_CANCEL is not supported, send CANCEL requests
2116  * separately per lock.
2117  * If \a req is not NULL, put handles of locks in \a cancels into the request
2118  * buffer at the offset \a off.
2119  * Destroy \a cancels at the end.
2120  */
2121 int ldlm_cli_cancel_list(struct list_head *cancels, int count,
2122                          struct ptlrpc_request *req,
2123                          enum ldlm_cancel_flags flags)
2124 {
2125         struct ldlm_lock *lock;
2126         int res = 0;
2127         ENTRY;
2128
2129         if (list_empty(cancels) || count == 0)
2130                 RETURN(0);
2131
2132         /* XXX: requests (both batched and not) could be sent in parallel.
2133          * Usually it is enough to have just 1 RPC, but it is possible that
2134          * there are too many locks to be cancelled in LRU or on a resource.
2135          * It would also speed up the case when the server does not support
2136          * the feature. */
2137         while (count > 0) {
2138                 LASSERT(!list_empty(cancels));
2139                 lock = list_entry(cancels->next, struct ldlm_lock,
2140                                       l_bl_ast);
2141                 LASSERT(lock->l_conn_export);
2142
2143                 if (exp_connect_cancelset(lock->l_conn_export)) {
2144                         res = count;
2145                         if (req)
2146                                 ldlm_cancel_pack(req, cancels, count);
2147                         else
2148                                 res = ldlm_cli_cancel_req(lock->l_conn_export,
2149                                                           cancels, count,
2150                                                           flags);
2151                 } else {
2152                         res = ldlm_cli_cancel_req(lock->l_conn_export,
2153                                                   cancels, 1, flags);
2154                 }
2155
2156                 if (res < 0) {
2157                         CDEBUG_LIMIT(res == -ESHUTDOWN ? D_DLMTRACE : D_ERROR,
2158                                      "ldlm_cli_cancel_list: %d\n", res);
2159                         res = count;
2160                 }
2161
2162                 count -= res;
2163                 ldlm_lock_list_put(cancels, l_bl_ast, res);
2164         }
2165         LASSERT(count == 0);
2166         RETURN(0);
2167 }
2168 EXPORT_SYMBOL(ldlm_cli_cancel_list);
2169
2170 /**
2171  * Cancel all locks on a resource that have 0 readers/writers.
2172  *
2173  * If flags & LDLM_FL_LOCAL_ONLY, throw the locks away without trying
2174  * to notify the server. */
2175 int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns,
2176                                     const struct ldlm_res_id *res_id,
2177                                     union ldlm_policy_data *policy,
2178                                     enum ldlm_mode mode,
2179                                     enum ldlm_cancel_flags flags, void *opaque)
2180 {
2181         struct ldlm_resource *res;
2182         struct list_head cancels = LIST_HEAD_INIT(cancels);
2183         int count;
2184         int rc;
2185         ENTRY;
2186
2187         res = ldlm_resource_get(ns, NULL, res_id, 0, 0);
2188         if (IS_ERR(res)) {
2189                 /* This is not a problem. */
2190                 CDEBUG(D_INFO, "No resource %llu\n", res_id->name[0]);
2191                 RETURN(0);
2192         }
2193
2194         LDLM_RESOURCE_ADDREF(res);
2195         count = ldlm_cancel_resource_local(res, &cancels, policy, mode,
2196                                            0, flags | LCF_BL_AST, opaque);
2197         rc = ldlm_cli_cancel_list(&cancels, count, NULL, flags);
2198         if (rc != ELDLM_OK)
2199                 CERROR("canceling unused lock "DLDLMRES": rc = %d\n",
2200                        PLDLMRES(res), rc);
2201
2202         LDLM_RESOURCE_DELREF(res);
2203         ldlm_resource_putref(res);
2204         RETURN(0);
2205 }
2206 EXPORT_SYMBOL(ldlm_cli_cancel_unused_resource);
2207
/* Arguments carried through the resource hash walk to
 * ldlm_cli_hash_cancel_unused(). */
struct ldlm_cli_cancel_arg {
        int lc_flags;           /* cancel flags applied to every resource */
        void *lc_opaque;        /* opaque pointer passed on for every resource */
};
2212
2213 static int
2214 ldlm_cli_hash_cancel_unused(struct cfs_hash *hs, struct cfs_hash_bd *bd,
2215                             struct hlist_node *hnode, void *arg)
2216 {
2217         struct ldlm_resource           *res = cfs_hash_object(hs, hnode);
2218         struct ldlm_cli_cancel_arg     *lc = arg;
2219
2220         ldlm_cli_cancel_unused_resource(ldlm_res_to_ns(res), &res->lr_name,
2221                                         NULL, LCK_MINMODE, lc->lc_flags,
2222                                         lc->lc_opaque);
2223         /* must return 0 for hash iteration */
2224         return 0;
2225 }
2226
2227 /**
2228  * Cancel all locks on a namespace (or a specific resource, if given)
2229  * that have 0 readers/writers.
2230  *
2231  * If flags & LCF_LOCAL, throw the locks away without trying
2232  * to notify the server. */
2233 int ldlm_cli_cancel_unused(struct ldlm_namespace *ns,
2234                            const struct ldlm_res_id *res_id,
2235                            enum ldlm_cancel_flags flags, void *opaque)
2236 {
2237         struct ldlm_cli_cancel_arg arg = {
2238                 .lc_flags       = flags,
2239                 .lc_opaque      = opaque,
2240         };
2241
2242         ENTRY;
2243
2244         if (ns == NULL)
2245                 RETURN(ELDLM_OK);
2246
2247         if (res_id != NULL) {
2248                 RETURN(ldlm_cli_cancel_unused_resource(ns, res_id, NULL,
2249                                                        LCK_MINMODE, flags,
2250                                                        opaque));
2251         } else {
2252                 cfs_hash_for_each_nolock(ns->ns_rs_hash,
2253                                          ldlm_cli_hash_cancel_unused, &arg, 0);
2254                 RETURN(ELDLM_OK);
2255         }
2256 }
2257
2258 /* Lock iterators. */
2259
2260 int ldlm_resource_foreach(struct ldlm_resource *res, ldlm_iterator_t iter,
2261                           void *closure)
2262 {
2263         struct list_head *tmp, *next;
2264         struct ldlm_lock *lock;
2265         int rc = LDLM_ITER_CONTINUE;
2266
2267         ENTRY;
2268
2269         if (!res)
2270                 RETURN(LDLM_ITER_CONTINUE);
2271
2272         lock_res(res);
2273         list_for_each_safe(tmp, next, &res->lr_granted) {
2274                 lock = list_entry(tmp, struct ldlm_lock, l_res_link);
2275
2276                 if (iter(lock, closure) == LDLM_ITER_STOP)
2277                         GOTO(out, rc = LDLM_ITER_STOP);
2278         }
2279
2280         list_for_each_safe(tmp, next, &res->lr_waiting) {
2281                 lock = list_entry(tmp, struct ldlm_lock, l_res_link);
2282
2283                 if (iter(lock, closure) == LDLM_ITER_STOP)
2284                         GOTO(out, rc = LDLM_ITER_STOP);
2285         }
2286 out:
2287         unlock_res(res);
2288         RETURN(rc);
2289 }
2290
2291 struct iter_helper_data {
2292         ldlm_iterator_t iter;
2293         void *closure;
2294 };
2295
2296 static int ldlm_iter_helper(struct ldlm_lock *lock, void *closure)
2297 {
2298         struct iter_helper_data *helper = closure;
2299         return helper->iter(lock, helper->closure);
2300 }
2301
2302 static int ldlm_res_iter_helper(struct cfs_hash *hs, struct cfs_hash_bd *bd,
2303                                 struct hlist_node *hnode, void *arg)
2304
2305 {
2306         struct ldlm_resource *res = cfs_hash_object(hs, hnode);
2307
2308         return ldlm_resource_foreach(res, ldlm_iter_helper, arg) ==
2309                LDLM_ITER_STOP;
2310 }
2311
2312 void ldlm_namespace_foreach(struct ldlm_namespace *ns,
2313                             ldlm_iterator_t iter, void *closure)
2314
2315 {
2316         struct iter_helper_data helper = { .iter = iter, .closure = closure };
2317
2318         cfs_hash_for_each_nolock(ns->ns_rs_hash,
2319                                  ldlm_res_iter_helper, &helper, 0);
2320
2321 }
2322
2323 /* non-blocking function to manipulate a lock whose cb_data is being put away.
2324  * return  0:  find no resource
2325  *       > 0:  must be LDLM_ITER_STOP/LDLM_ITER_CONTINUE.
2326  *       < 0:  errors
2327  */
2328 int ldlm_resource_iterate(struct ldlm_namespace *ns,
2329                           const struct ldlm_res_id *res_id,
2330                           ldlm_iterator_t iter, void *data)
2331 {
2332         struct ldlm_resource *res;
2333         int rc;
2334         ENTRY;
2335
2336         LASSERTF(ns != NULL, "must pass in namespace\n");
2337
2338         res = ldlm_resource_get(ns, NULL, res_id, 0, 0);
2339         if (IS_ERR(res))
2340                 RETURN(0);
2341
2342         LDLM_RESOURCE_ADDREF(res);
2343         rc = ldlm_resource_foreach(res, iter, data);
2344         LDLM_RESOURCE_DELREF(res);
2345         ldlm_resource_putref(res);
2346         RETURN(rc);
2347 }
2348 EXPORT_SYMBOL(ldlm_resource_iterate);
2349
2350 /* Lock replay */
2351
2352 static int ldlm_chain_lock_for_replay(struct ldlm_lock *lock, void *closure)
2353 {
2354         struct list_head *list = closure;
2355
2356         /* we use l_pending_chain here, because it's unused on clients. */
2357         LASSERTF(list_empty(&lock->l_pending_chain),
2358                  "lock %p next %p prev %p\n",
2359                  lock, &lock->l_pending_chain.next,&lock->l_pending_chain.prev);
2360         /* bug 9573: don't replay locks left after eviction, or
2361          * bug 17614: locks being actively cancelled. Get a reference
2362          * on a lock so that it does not disapear under us (e.g. due to cancel)
2363          */
2364         if (!(lock->l_flags & (LDLM_FL_FAILED|LDLM_FL_BL_DONE))) {
2365                 list_add(&lock->l_pending_chain, list);
2366                 LDLM_LOCK_GET(lock);
2367         }
2368
2369         return LDLM_ITER_CONTINUE;
2370 }
2371
2372 static int replay_lock_interpret(const struct lu_env *env,
2373                                  struct ptlrpc_request *req, void *args, int rc)
2374 {
2375         struct ldlm_async_args *aa = args;
2376         struct ldlm_lock     *lock;
2377         struct ldlm_reply    *reply;
2378         struct obd_export    *exp;
2379
2380         ENTRY;
2381         atomic_dec(&req->rq_import->imp_replay_inflight);
2382         if (rc != ELDLM_OK)
2383                 GOTO(out, rc);
2384
2385         reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP);
2386         if (reply == NULL)
2387                 GOTO(out, rc = -EPROTO);
2388
2389         lock = ldlm_handle2lock(&aa->lock_handle);
2390         if (!lock) {
2391                 CERROR("received replay ack for unknown local cookie %#llx"
2392                        " remote cookie %#llx from server %s id %s\n",
2393                        aa->lock_handle.cookie, reply->lock_handle.cookie,
2394                        req->rq_export->exp_client_uuid.uuid,
2395                        libcfs_id2str(req->rq_peer));
2396                 GOTO(out, rc = -ESTALE);
2397         }
2398
2399         /* Key change rehash lock in per-export hash with new key */
2400         exp = req->rq_export;
2401         if (exp && exp->exp_lock_hash) {
2402                 /* In the function below, .hs_keycmp resolves to
2403                  * ldlm_export_lock_keycmp() */
2404                 /* coverity[overrun-buffer-val] */
2405                 cfs_hash_rehash_key(exp->exp_lock_hash,
2406                                     &lock->l_remote_handle,
2407                                     &reply->lock_handle,
2408                                     &lock->l_exp_hash);
2409         } else {
2410                 lock->l_remote_handle = reply->lock_handle;
2411         }
2412
2413         LDLM_DEBUG(lock, "replayed lock:");
2414         ptlrpc_import_recovery_state_machine(req->rq_import);
2415         LDLM_LOCK_PUT(lock);
2416 out:
2417         if (rc != ELDLM_OK)
2418                 ptlrpc_connect_import(req->rq_import);
2419
2420         RETURN(rc);
2421 }
2422
2423 static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock)
2424 {
2425         struct ptlrpc_request *req;
2426         struct ldlm_async_args *aa;
2427         struct ldlm_request   *body;
2428         int flags;
2429         ENTRY;
2430
2431
2432         /* Bug 11974: Do not replay a lock which is actively being canceled */
2433         if (ldlm_is_bl_done(lock)) {
2434                 LDLM_DEBUG(lock, "Not replaying canceled lock:");
2435                 RETURN(0);
2436         }
2437
2438         /* If this is reply-less callback lock, we cannot replay it, since
2439          * server might have long dropped it, but notification of that event was
2440          * lost by network. (and server granted conflicting lock already) */
2441         if (ldlm_is_cancel_on_block(lock)) {
2442                 LDLM_DEBUG(lock, "Not replaying reply-less lock:");
2443                 ldlm_lock_cancel(lock);
2444                 RETURN(0);
2445         }
2446
2447         /*
2448          * If granted mode matches the requested mode, this lock is granted.
2449          *
2450          * If we haven't been granted anything and are on a resource list,
2451          * then we're blocked/waiting.
2452          *
2453          * If we haven't been granted anything and we're NOT on a resource list,
2454          * then we haven't got a reply yet and don't have a known disposition.
2455          * This happens whenever a lock enqueue is the request that triggers
2456          * recovery.
2457          */
2458         if (ldlm_is_granted(lock))
2459                 flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_GRANTED;
2460         else if (!list_empty(&lock->l_res_link))
2461                 flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_WAIT;
2462         else
2463                 flags = LDLM_FL_REPLAY;
2464
2465         req = ptlrpc_request_alloc_pack(imp, &RQF_LDLM_ENQUEUE,
2466                                         LUSTRE_DLM_VERSION, LDLM_ENQUEUE);
2467         if (req == NULL)
2468                 RETURN(-ENOMEM);
2469
2470         /* We're part of recovery, so don't wait for it. */
2471         req->rq_send_state = LUSTRE_IMP_REPLAY_LOCKS;
2472
2473         body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
2474         ldlm_lock2desc(lock, &body->lock_desc);
2475         body->lock_flags = ldlm_flags_to_wire(flags);
2476
2477         ldlm_lock2handle(lock, &body->lock_handle[0]);
2478         if (lock->l_lvb_len > 0)
2479                 req_capsule_extend(&req->rq_pill, &RQF_LDLM_ENQUEUE_LVB);
2480         req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
2481                              lock->l_lvb_len);
2482         ptlrpc_request_set_replen(req);
2483         /* notify the server we've replayed all requests.
2484          * also, we mark the request to be put on a dedicated
2485          * queue to be processed after all request replayes.
2486          * bug 6063 */
2487         lustre_msg_set_flags(req->rq_reqmsg, MSG_REQ_REPLAY_DONE);
2488
2489         LDLM_DEBUG(lock, "replaying lock:");
2490
2491         atomic_inc(&req->rq_import->imp_replay_inflight);
2492         CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
2493         aa = ptlrpc_req_async_args(req);
2494         aa->lock_handle = body->lock_handle[0];
2495         req->rq_interpret_reply = replay_lock_interpret;
2496         ptlrpcd_add_req(req);
2497
2498         RETURN(0);
2499 }
2500
2501 /**
2502  * Cancel as many unused locks as possible before replay. since we are
2503  * in recovery, we can't wait for any outstanding RPCs to send any RPC
2504  * to the server.
2505  *
2506  * Called only in recovery before replaying locks. there is no need to
2507  * replay locks that are unused. since the clients may hold thousands of
2508  * cached unused locks, dropping the unused locks can greatly reduce the
2509  * load on the servers at recovery time.
2510  */
2511 static void ldlm_cancel_unused_locks_for_replay(struct ldlm_namespace *ns)
2512 {
2513         int canceled;
2514         struct list_head cancels = LIST_HEAD_INIT(cancels);
2515
2516         CDEBUG(D_DLMTRACE, "Dropping as many unused locks as possible before"
2517                            "replay for namespace %s (%d)\n",
2518                            ldlm_ns_name(ns), ns->ns_nr_unused);
2519
2520         /* We don't need to care whether or not LRU resize is enabled
2521          * because the LDLM_LRU_FLAG_NO_WAIT policy doesn't use the
2522          * count parameter */
2523         canceled = ldlm_cancel_lru_local(ns, &cancels, ns->ns_nr_unused, 0,
2524                                          LCF_LOCAL, LDLM_LRU_FLAG_NO_WAIT);
2525
2526         CDEBUG(D_DLMTRACE, "Canceled %d unused locks from namespace %s\n",
2527                            canceled, ldlm_ns_name(ns));
2528 }
2529
2530 int ldlm_replay_locks(struct obd_import *imp)
2531 {
2532         struct ldlm_namespace *ns = imp->imp_obd->obd_namespace;
2533         struct list_head list = LIST_HEAD_INIT(list);
2534         struct ldlm_lock *lock, *next;
2535         int rc = 0;
2536
2537         ENTRY;
2538
2539         LASSERT(atomic_read(&imp->imp_replay_inflight) == 0);
2540
2541         /* don't replay locks if import failed recovery */
2542         if (imp->imp_vbr_failed)
2543                 RETURN(0);
2544
2545         /* ensure this doesn't fall to 0 before all have been queued */
2546         atomic_inc(&imp->imp_replay_inflight);
2547
2548         if (ldlm_cancel_unused_locks_before_replay)
2549                 ldlm_cancel_unused_locks_for_replay(ns);
2550
2551         ldlm_namespace_foreach(ns, ldlm_chain_lock_for_replay, &list);
2552
2553         list_for_each_entry_safe(lock, next, &list, l_pending_chain) {
2554                 list_del_init(&lock->l_pending_chain);
2555                 if (rc) {
2556                         LDLM_LOCK_RELEASE(lock);
2557                         continue; /* or try to do the rest? */
2558                 }
2559                 rc = replay_one_lock(imp, lock);
2560                 LDLM_LOCK_RELEASE(lock);
2561         }
2562
2563         atomic_dec(&imp->imp_replay_inflight);
2564
2565         RETURN(rc);
2566 }