LU-1347 ldlm: makes EXPORT_SYMBOL follows function body

[fs/lustre-release.git] / lustre / ldlm / ldlm_request.c
diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c

index b6fb1d1..dd561dc 100644 (file)
--- a/lustre/ldlm/ldlm_request.c
+++ b/lustre/ldlm/ldlm_request.c
@@ -1,6 +1,4 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
+/*
   * GPL HEADER START
   *
   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
@@ -26,8 +24,10 @@
   * GPL HEADER END
   */
  /*
- * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
   * Use is subject to license terms.
+ *
+ * Copyright (c) 2010, 2011, Whamcloud, Inc.
   */
  /*
   * This file is part of Lustre, http://www.lustre.org/
@@ -50,6 +50,9 @@ int ldlm_enqueue_min = OBD_TIMEOUT_DEFAULT;
  CFS_MODULE_PARM(ldlm_enqueue_min, "i", int, 0644,
                  "lock enqueue timeout minimum");
  
+/* On the client side: whether cached locks are canceled before replay. */
+unsigned int ldlm_cancel_unused_locks_before_replay = 1;
+
  static void interrupted_completion_wait(void *data)
  {
  }
@@ -87,7 +90,7 @@ int ldlm_expired_completion_wait(void *data)
                          last_dump = next_dump;
                          next_dump = cfs_time_shift(300);
                          ldlm_namespace_dump(D_DLMTRACE,
-                                            lock->l_resource->lr_namespace);
+                                            ldlm_lock_to_ns(lock));
                          if (last_dump == 0)
                                  libcfs_debug_dumplog();
                  }
@@ -105,18 +108,19 @@ int ldlm_expired_completion_wait(void *data)
  
          RETURN(0);
  }
+EXPORT_SYMBOL(ldlm_expired_completion_wait);
  
  /* We use the same basis for both server side and client side functions
     from a single node. */
  int ldlm_get_enq_timeout(struct ldlm_lock *lock)
  {
-        int timeout = at_get(&lock->l_resource->lr_namespace->ns_at_estimate);
+        int timeout = at_get(ldlm_lock_to_ns_at(lock));
          if (AT_OFF)
                  return obd_timeout / 2;
          /* Since these are non-updating timeouts, we should be conservative.
             It would be nice to have some kind of "early reply" mechanism for
             lock callbacks too... */
-        timeout = timeout + (timeout >> 1); /* 150% */
+        timeout = min_t(int, at_max, timeout + (timeout >> 1)); /* 150% */
          return max(timeout, ldlm_enqueue_min);
  }
  EXPORT_SYMBOL(ldlm_get_enq_timeout);
@@ -140,7 +144,8 @@ static int ldlm_completion_tail(struct ldlm_lock *lock)
                             CFS_DURATION_T"s", delay);
  
                  /* Update our time estimate */
-                at_add(&lock->l_resource->lr_namespace->ns_at_estimate, delay);
+                at_measured(ldlm_lock_to_ns_at(lock),
+                            delay);
                  result = 0;
          }
          return result;
@@ -168,10 +173,10 @@ int ldlm_completion_ast_async(struct ldlm_lock *lock, int flags, void *data)
  
          LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, "
                     "going forward");
-        ldlm_lock_dump(D_OTHER, lock, 0);
          ldlm_reprocess_all(lock->l_resource);
          RETURN(0);
  }
+EXPORT_SYMBOL(ldlm_completion_ast_async);
  
  /**
   * Client side LDLM "completion" AST. This is called in several cases:
@@ -218,7 +223,6 @@ int ldlm_completion_ast(struct ldlm_lock *lock, int flags, void *data)
  
          LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, "
                     "sleeping");
-        ldlm_lock_dump(D_OTHER, lock, 0);
  
  noreproc:
  
@@ -246,12 +250,12 @@ noreproc:
          }
  
          if (imp != NULL) {
-                spin_lock(&imp->imp_lock);
+                cfs_spin_lock(&imp->imp_lock);
                  lwd.lwd_conn_cnt = imp->imp_conn_cnt;
-                spin_unlock(&imp->imp_lock);
+                cfs_spin_unlock(&imp->imp_lock);
          }
  
-        if (ns_is_client(lock->l_resource->lr_namespace) &&
+        if (ns_is_client(ldlm_lock_to_ns(lock)) &&
              OBD_FAIL_CHECK_RESET(OBD_FAIL_LDLM_INTR_CP_AST,
                                   OBD_FAIL_LDLM_CP_BL_RACE | OBD_FAIL_ONCE)) {
                  lock->l_flags |= LDLM_FL_FAIL_LOC;
@@ -270,6 +274,7 @@ noreproc:
  
          RETURN(ldlm_completion_tail(lock));
  }
+EXPORT_SYMBOL(ldlm_completion_ast);
  
  /**
   * A helper to build a blocking ast function
@@ -306,6 +311,7 @@ int ldlm_blocking_ast_nocheck(struct ldlm_lock *lock)
          }
          RETURN(0);
  }
+EXPORT_SYMBOL(ldlm_blocking_ast_nocheck);
  
  /**
   * Server blocking AST
@@ -333,7 +339,7 @@ int ldlm_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
          lock_res_and_lock(lock);
          /* Get this: if ldlm_blocking_ast is racing with intent_policy, such
           * that ldlm_blocking_ast is called just before intent_policy method
-         * takes the ns_lock, then by the time we get the lock, we might not
+         * takes the lr_lock, then by the time we get the lock, we might not
           * be the correct blocking function anymore.  So check, and return
           * early, if so. */
          if (lock->l_blocking_ast != ldlm_blocking_ast) {
@@ -342,6 +348,7 @@ int ldlm_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
          }
          RETURN(ldlm_blocking_ast_nocheck(lock));
  }
+EXPORT_SYMBOL(ldlm_blocking_ast);
  
  /*
   * ->l_glimpse_ast() for DLM extent locks acquired on the server-side. See
@@ -369,6 +376,7 @@ int ldlm_glimpse_ast(struct ldlm_lock *lock, void *reqp)
           */
          return -ELDLM_NO_LOCK_DATA;
  }
+EXPORT_SYMBOL(ldlm_glimpse_ast);
  
  int ldlm_cli_enqueue_local(struct ldlm_namespace *ns,
                             const struct ldlm_res_id *res_id,
@@ -377,7 +385,7 @@ int ldlm_cli_enqueue_local(struct ldlm_namespace *ns,
                             ldlm_blocking_callback blocking,
                             ldlm_completion_callback completion,
                             ldlm_glimpse_callback glimpse,
-                           void *data, __u32 lvb_len, void *lvb_swabber,
+                           void *data, __u32 lvb_len,
                             const __u64 *client_cookie,
                             struct lustre_handle *lockh)
  {
@@ -398,16 +406,16 @@ int ldlm_cli_enqueue_local(struct ldlm_namespace *ns,
          lock = ldlm_lock_create(ns, res_id, type, mode, &cbs, data, lvb_len);
          if (unlikely(!lock))
                  GOTO(out_nolock, err = -ENOMEM);
-        LDLM_DEBUG(lock, "client-side local enqueue handler, new lock created");
  
-        ldlm_lock_addref_internal(lock, mode);
          ldlm_lock2handle(lock, lockh);
-        lock_res_and_lock(lock);
+
+        /* NB: we don't have any lock now (lock_res_and_lock)
+         * because it's a new lock */
+        ldlm_lock_addref_internal_nolock(lock, mode);
          lock->l_flags |= LDLM_FL_LOCAL;
          if (*flags & LDLM_FL_ATOMIC_CB)
                  lock->l_flags |= LDLM_FL_ATOMIC_CB;
-        lock->l_lvb_swabber = lvb_swabber;
-        unlock_res_and_lock(lock);
+
          if (policy != NULL)
                  lock->l_policy_data = *policy;
          if (client_cookie != NULL)
@@ -422,23 +430,20 @@ int ldlm_cli_enqueue_local(struct ldlm_namespace *ns,
          if (policy != NULL)
                  *policy = lock->l_policy_data;
  
-        LDLM_DEBUG_NOLOCK("client-side local enqueue handler END (lock %p)",
-                          lock);
-
          if (lock->l_completion_ast)
                  lock->l_completion_ast(lock, *flags, NULL);
  
-        LDLM_DEBUG(lock, "client-side local enqueue END");
+        LDLM_DEBUG(lock, "client-side local enqueue handler, new lock created");
          EXIT;
   out:
          LDLM_LOCK_RELEASE(lock);
   out_nolock:
          return err;
  }
+EXPORT_SYMBOL(ldlm_cli_enqueue_local);
  
  static void failed_lock_cleanup(struct ldlm_namespace *ns,
-                                struct ldlm_lock *lock,
-                                struct lustre_handle *lockh, int mode)
+                                struct ldlm_lock *lock, int mode)
  {
          int need_cancel = 0;
  
@@ -451,38 +456,44 @@ static void failed_lock_cleanup(struct ldlm_namespace *ns,
                   * bl_ast and -EINVAL reply is sent to server anyways.
                   * bug 17645 */
                  lock->l_flags |= LDLM_FL_LOCAL_ONLY | LDLM_FL_FAILED |
-                                 LDLM_FL_ATOMIC_CB;
+                                 LDLM_FL_ATOMIC_CB | LDLM_FL_CBPENDING;
                  need_cancel = 1;
          }
          unlock_res_and_lock(lock);
  
-        if (need_cancel) {
+        if (need_cancel)
                  LDLM_DEBUG(lock,
                             "setting FL_LOCAL_ONLY | LDLM_FL_FAILED | "
-                           "LDLM_FL_ATOMIC_CB");
-                ldlm_lock_decref_and_cancel(lockh, mode);
-        } else {
+                           "LDLM_FL_ATOMIC_CB | LDLM_FL_CBPENDING");
+        else
                  LDLM_DEBUG(lock, "lock was granted or failed in race");
-                ldlm_lock_decref(lockh, mode);
-        }
+
+        ldlm_lock_decref_internal(lock, mode);
  
          /* XXX - HACK because we shouldn't call ldlm_lock_destroy()
           *       from llite/file.c/ll_file_flock(). */
+        /* This code makes up for the fact that we do not have a blocking
+         * handler on a client for flock locks. As such this is the place where
+         * we must completely kill failed locks (interrupted ones and those
+         * that were waiting to be granted when the server evicted us). */
          if (lock->l_resource->lr_type == LDLM_FLOCK) {
-                ldlm_lock_destroy(lock);
+                lock_res_and_lock(lock);
+                ldlm_resource_unlink_lock(lock);
+                ldlm_lock_destroy_nolock(lock);
+                unlock_res_and_lock(lock);
          }
  }
  
  int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req,
                            ldlm_type_t type, __u8 with_policy, ldlm_mode_t mode,
                            int *flags, void *lvb, __u32 lvb_len,
-                          void *lvb_swabber, struct lustre_handle *lockh,int rc)
+                          struct lustre_handle *lockh,int rc)
  {
          struct ldlm_namespace *ns = exp->exp_obd->obd_namespace;
          int is_replay = *flags & LDLM_FL_REPLAY;
-        struct lustre_handle old_hash_key;
          struct ldlm_lock *lock;
          struct ldlm_reply *reply;
+        struct ost_lvb *tmplvb;
          int cleanup_phase = 1;
          ENTRY;
  
@@ -504,14 +515,12 @@ int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req,
                          if (reply == NULL)
                                  rc = -EPROTO;
                          if (lvb_len) {
-                                struct ost_lvb *tmplvb;
  
                                  req_capsule_set_size(&req->rq_pill,
                                                       &RMF_DLM_LVB, RCL_SERVER,
                                                       lvb_len);
-                            tmplvb = req_capsule_server_swab_get(&req->rq_pill,
-                                                                 &RMF_DLM_LVB,
-                                                                 lvb_swabber);
+                                tmplvb = req_capsule_server_get(&req->rq_pill,
+                                                                 &RMF_DLM_LVB);
                                  if (tmplvb == NULL)
                                          GOTO(cleanup, rc = -EPROTO);
                                  if (lvb != NULL)
@@ -529,14 +538,18 @@ int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req,
          cleanup_phase = 0;
  
          lock_res_and_lock(lock);
-        old_hash_key = lock->l_remote_handle;
-        lock->l_remote_handle = reply->lock_handle;
-
          /* Key change rehash lock in per-export hash with new key */
-        if (exp->exp_lock_hash)
-                lustre_hash_rehash_key(exp->exp_lock_hash, &old_hash_key,
-                                       &lock->l_remote_handle,
-                                       &lock->l_exp_hash);
+        if (exp->exp_lock_hash) {
+               /* In the function below, .hs_keycmp resolves to
+                * ldlm_export_lock_keycmp() */
+               /* coverity[overrun-buffer-val] */
+                cfs_hash_rehash_key(exp->exp_lock_hash,
+                                    &lock->l_remote_handle,
+                                    &reply->lock_handle,
+                                    &lock->l_exp_hash);
+        } else {
+                lock->l_remote_handle = reply->lock_handle;
+        }
  
          *flags = reply->lock_flags;
          lock->l_flags |= reply->lock_flags & LDLM_INHERIT_FLAGS;
@@ -582,8 +595,11 @@ int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req,
                  if (with_policy)
                          if (!(type == LDLM_IBITS && !(exp->exp_connect_flags &
                                                      OBD_CONNECT_IBITS)))
-                                lock->l_policy_data =
-                                                 reply->lock_desc.l_policy_data;
+                                /* We assume lock type cannot change on server*/
+                                ldlm_convert_policy_to_local(exp,
+                                                lock->l_resource->lr_type,
+                                                &reply->lock_desc.l_policy_data,
+                                                &lock->l_policy_data);
                  if (type != LDLM_PLAIN)
                          LDLM_DEBUG(lock,"client-side enqueue, new policy data");
          }
@@ -601,17 +617,25 @@ int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req,
  
          /* If the lock has already been granted by a completion AST, don't
           * clobber the LVB with an older one. */
-        if (lvb_len && (lock->l_req_mode != lock->l_granted_mode)) {
-                void *tmplvb;
+        if (lvb_len) {
+                /* We must lock or a racing completion might update lvb
+                   without letting us know and we'll clobber the correct value.
+                   Cannot unlock after the check either, as that still leaves
+                   a tiny window for completion to get in */
+                lock_res_and_lock(lock);
+                if (lock->l_req_mode != lock->l_granted_mode) {
  
-                req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
-                                     lvb_len);
-                tmplvb = req_capsule_server_swab_get(&req->rq_pill,
-                                                     &RMF_DLM_LVB,
-                                                     lvb_swabber);
-                if (tmplvb == NULL)
-                        GOTO(cleanup, rc = -EPROTO);
-                memcpy(lock->l_lvb_data, tmplvb, lvb_len);
+                        req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB,
+                                             RCL_SERVER, lvb_len);
+                        tmplvb = req_capsule_server_get(&req->rq_pill,
+                                                             &RMF_DLM_LVB);
+                        if (tmplvb == NULL) {
+                                unlock_res_and_lock(lock);
+                                GOTO(cleanup, rc = -EPROTO);
+                        }
+                        memcpy(lock->l_lvb_data, tmplvb, lvb_len);
+                }
+                unlock_res_and_lock(lock);
          }
  
          if (!is_replay) {
@@ -620,7 +644,7 @@ int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req,
                          int err = lock->l_completion_ast(lock, *flags, NULL);
                          if (!rc)
                                  rc = err;
-                        if (rc && type != LDLM_FLOCK) /* bug 9425, bug 10250 */
+                        if (rc)
                                  cleanup_phase = 1;
                  }
          }
@@ -635,12 +659,13 @@ int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req,
          EXIT;
  cleanup:
          if (cleanup_phase == 1 && rc)
-                failed_lock_cleanup(ns, lock, lockh, mode);
+                failed_lock_cleanup(ns, lock, mode);
          /* Put lock 2 times, the second reference is held by ldlm_cli_enqueue */
          LDLM_LOCK_PUT(lock);
          LDLM_LOCK_RELEASE(lock);
          return rc;
  }
+EXPORT_SYMBOL(ldlm_cli_enqueue_fini);
  
  /* PAGE_SIZE-512 is to allow TCP/IP and LNET headers to fit into
   * a single page on the send/receive side. XXX: 512 should be changed
@@ -679,12 +704,12 @@ static inline int ldlm_format_handles_avail(struct obd_import *imp,
   * @count locks in @cancels. */
  int ldlm_prep_elc_req(struct obd_export *exp, struct ptlrpc_request *req,
                        int version, int opc, int canceloff,
-                      struct list_head *cancels, int count)
+                      cfs_list_t *cancels, int count)
  {
          struct ldlm_namespace   *ns = exp->exp_obd->obd_namespace;
          struct req_capsule      *pill = &req->rq_pill;
          struct ldlm_request     *dlm = NULL;
-        int flags, avail, to_free, bufcount, pack = 0;
+        int flags, avail, to_free, pack = 0;
          CFS_LIST_HEAD(head);
          int rc;
          ENTRY;
@@ -693,7 +718,7 @@ int ldlm_prep_elc_req(struct obd_export *exp, struct ptlrpc_request *req,
                  cancels = &head;
          if (exp_connect_cancelset(exp)) {
                  /* Estimate the amount of available space in the request. */
-                bufcount = req_capsule_filled_sizes(pill, RCL_CLIENT);
+                req_capsule_filled_sizes(pill, RCL_CLIENT);
                  avail = ldlm_capsule_handles_avail(pill, RCL_CLIENT, canceloff);
  
                  flags = ns_connect_lru_resize(ns) ?
@@ -740,13 +765,15 @@ int ldlm_prep_elc_req(struct obd_export *exp, struct ptlrpc_request *req,
          }
          RETURN(0);
  }
+EXPORT_SYMBOL(ldlm_prep_elc_req);
  
  int ldlm_prep_enqueue_req(struct obd_export *exp, struct ptlrpc_request *req,
-                          struct list_head *cancels, int count)
+                          cfs_list_t *cancels, int count)
  {
          return ldlm_prep_elc_req(exp, req, LUSTRE_DLM_VERSION, LDLM_ENQUEUE,
                                   LDLM_ENQUEUE_CANCEL_OFF, cancels, count);
  }
+EXPORT_SYMBOL(ldlm_prep_enqueue_req);
  
  /* If a request has some specific initialisation it is passed in @reqp,
   * otherwise it is created in ldlm_cli_enqueue.
@@ -757,9 +784,9 @@ int ldlm_prep_enqueue_req(struct obd_export *exp, struct ptlrpc_request *req,
  int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp,
                       struct ldlm_enqueue_info *einfo,
                       const struct ldlm_res_id *res_id,
-                     ldlm_policy_data_t *policy, int *flags,
-                     void *lvb, __u32 lvb_len, void *lvb_swabber,
-                     struct lustre_handle *lockh, int async)
+                     ldlm_policy_data_t const *policy, int *flags,
+                     void *lvb, __u32 lvb_len, struct lustre_handle *lockh,
+                     int async)
  {
          struct ldlm_namespace *ns = exp->exp_obd->obd_namespace;
          struct ldlm_lock      *lock;
@@ -794,7 +821,6 @@ int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp,
                  /* for the local lock, add the reference */
                  ldlm_lock_addref_internal(lock, einfo->ei_mode);
                  ldlm_lock2handle(lock, lockh);
-                lock->l_lvb_swabber = lvb_swabber;
                  if (policy != NULL) {
                          /* INODEBITS_INTEROP: If the server does not support
                           * inodebits, we will request a plain lock in the
@@ -815,6 +841,11 @@ int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp,
                  LDLM_DEBUG(lock, "client-side enqueue START");
          }
  
+       lock->l_conn_export = exp;
+       lock->l_export = NULL;
+       lock->l_blocking_ast = einfo->ei_cb_bl;
+       lock->l_flags |= (*flags & LDLM_FL_NO_LRU);
+
          /* lock not sent to server yet */
  
          if (reqp == NULL || *reqp == NULL) {
@@ -823,7 +854,7 @@ int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp,
                                                  LUSTRE_DLM_VERSION,
                                                  LDLM_ENQUEUE);
                  if (req == NULL) {
-                        failed_lock_cleanup(ns, lock, lockh, einfo->ei_mode);
+                        failed_lock_cleanup(ns, lock, einfo->ei_mode);
                          LDLM_LOCK_RELEASE(lock);
                          RETURN(-ENOMEM);
                  }
@@ -840,10 +871,6 @@ int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp,
                           DLM_LOCKREQ_OFF, len, (int)sizeof(*body));
          }
  
-        lock->l_conn_export = exp;
-        lock->l_export = NULL;
-        lock->l_blocking_ast = einfo->ei_cb_bl;
-
          /* Dump lock data into the request buffer */
          body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ);
          ldlm_lock2desc(lock, &body->lock_desc);
@@ -880,7 +907,7 @@ int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp,
  
          err = ldlm_cli_enqueue_fini(exp, req, einfo->ei_type, policy ? 1 : 0,
                                      einfo->ei_mode, flags, lvb, lvb_len,
-                                    lvb_swabber, lockh, rc);
+                                    lockh, rc);
  
          /* If ldlm_cli_enqueue_fini did not find the lock, we need to free
           * one reference that we took */
@@ -897,6 +924,7 @@ int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp,
  
          RETURN(rc);
  }
+EXPORT_SYMBOL(ldlm_cli_enqueue);
  
  static int ldlm_cli_convert_local(struct ldlm_lock *lock, int new_mode,
                                    __u32 *flags)
@@ -904,7 +932,7 @@ static int ldlm_cli_convert_local(struct ldlm_lock *lock, int new_mode,
          struct ldlm_resource *res;
          int rc;
          ENTRY;
-        if (ns_is_client(lock->l_resource->lr_namespace)) {
+        if (ns_is_client(ldlm_lock_to_ns(lock))) {
                  CERROR("Trying to cancel local lock\n");
                  LBUG();
          }
@@ -995,6 +1023,7 @@ int ldlm_cli_convert(struct lustre_handle *lockh, int new_mode, __u32 *flags)
          ptlrpc_req_finished(req);
          return rc;
  }
+EXPORT_SYMBOL(ldlm_cli_convert);
  
  /* Cancel locks locally.
   * Returns:
@@ -1027,14 +1056,13 @@ static int ldlm_cli_cancel_local(struct ldlm_lock *lock)
                  }
                  ldlm_lock_cancel(lock);
          } else {
-                if (ns_is_client(lock->l_resource->lr_namespace)) {
+                if (ns_is_client(ldlm_lock_to_ns(lock))) {
                          LDLM_ERROR(lock, "Trying to cancel local lock");
                          LBUG();
                  }
                  LDLM_DEBUG(lock, "server-side local cancel");
                  ldlm_lock_cancel(lock);
                  ldlm_reprocess_all(lock->l_resource);
-                LDLM_DEBUG(lock, "server-side local cancel handler END");
          }
  
          RETURN(rc);
@@ -1043,7 +1071,7 @@ static int ldlm_cli_cancel_local(struct ldlm_lock *lock)
  /* Pack @count locks in @head into ldlm_request buffer at the offset @off,
     of the request @req. */
  static void ldlm_cancel_pack(struct ptlrpc_request *req,
-                             struct list_head *head, int count)
+                             cfs_list_t *head, int count)
  {
          struct ldlm_request *dlm;
          struct ldlm_lock *lock;
@@ -1063,7 +1091,7 @@ static void ldlm_cancel_pack(struct ptlrpc_request *req,
          /* XXX: it would be better to pack lock handles grouped by resource.
           * so that the server cancel would call filter_lvbo_update() less
           * frequently. */
-        list_for_each_entry(lock, head, l_bl_ast) {
+        cfs_list_for_each_entry(lock, head, l_bl_ast) {
                  if (!count--)
                          break;
                  LASSERT(lock->l_conn_export);
@@ -1078,8 +1106,8 @@ static void ldlm_cancel_pack(struct ptlrpc_request *req,
  
  /* Prepare and send a batched cancel rpc, it will include count lock handles
   * of locks given in @head. */
-int ldlm_cli_cancel_req(struct obd_export *exp, struct list_head *cancels,
-                        int count, int flags)
+int ldlm_cli_cancel_req(struct obd_export *exp, cfs_list_t *cancels,
+                        int count, ldlm_cancel_flags_t flags)
  {
          struct ptlrpc_request *req = NULL;
          struct obd_import *imp;
@@ -1090,9 +1118,9 @@ int ldlm_cli_cancel_req(struct obd_export *exp, struct list_head *cancels,
          LASSERT(exp != NULL);
          LASSERT(count > 0);
  
-        OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_PAUSE_CANCEL, obd_fail_val);
+        CFS_FAIL_TIMEOUT(OBD_FAIL_LDLM_PAUSE_CANCEL, cfs_fail_val);
  
-        if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_RACE))
+        if (CFS_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_RACE))
                  RETURN(count);
  
          free = ldlm_format_handles_avail(class_exp2cliimp(exp),
@@ -1101,8 +1129,6 @@ int ldlm_cli_cancel_req(struct obd_export *exp, struct list_head *cancels,
                  count = free;
  
          while (1) {
-                int bufcount;
-
                  imp = class_exp2cliimp(exp);
                  if (imp == NULL || imp->imp_invalid) {
                          CDEBUG(D_DLMTRACE,
@@ -1114,7 +1140,7 @@ int ldlm_cli_cancel_req(struct obd_export *exp, struct list_head *cancels,
                  if (req == NULL)
                          GOTO(out, rc = -ENOMEM);
  
-                bufcount = req_capsule_filled_sizes(&req->rq_pill, RCL_CLIENT);
+                req_capsule_filled_sizes(&req->rq_pill, RCL_CLIENT);
                  req_capsule_set_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT,
                                       ldlm_request_bufsize(count, LDLM_CANCEL));
  
@@ -1133,8 +1159,8 @@ int ldlm_cli_cancel_req(struct obd_export *exp, struct list_head *cancels,
                  ldlm_cancel_pack(req, cancels, count);
  
                  ptlrpc_request_set_replen(req);
-                if (flags & LDLM_FL_ASYNC) {
-                        ptlrpcd_add_req(req, PSCOPE_OTHER);
+                if (flags & LCF_ASYNC) {
+                        ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1);
                          sent = count;
                          GOTO(out, 0);
                  } else {
@@ -1164,6 +1190,7 @@ int ldlm_cli_cancel_req(struct obd_export *exp, struct list_head *cancels,
  out:
          return sent ? sent : rc;
  }
+EXPORT_SYMBOL(ldlm_cli_cancel_req);
  
  static inline struct ldlm_pool *ldlm_imp2pl(struct obd_import *imp)
  {
@@ -1177,7 +1204,7 @@ static inline struct ldlm_pool *ldlm_imp2pl(struct obd_import *imp)
  int ldlm_cli_update_pool(struct ptlrpc_request *req)
  {
          struct obd_device *obd;
-        __u64 old_slv, new_slv;
+        __u64 new_slv;
          __u32 new_limit;
          ENTRY;
          if (unlikely(!req->rq_import || !req->rq_import->imp_obd ||
@@ -1216,11 +1243,10 @@ int ldlm_cli_update_pool(struct ptlrpc_request *req)
           * alive in cleanup time. Evil races are possible which may cause
           * oops in that time.
           */
-        write_lock(&obd->obd_pool_lock);
-        old_slv = obd->obd_pool_slv;
+        cfs_write_lock(&obd->obd_pool_lock);
          obd->obd_pool_slv = new_slv;
          obd->obd_pool_limit = new_limit;
-        write_unlock(&obd->obd_pool_lock);
+        cfs_write_unlock(&obd->obd_pool_lock);
  
          RETURN(0);
  }
@@ -1250,8 +1276,8 @@ int ldlm_cli_cancel(struct lustre_handle *lockh)
          /* Even if the lock is marked as LDLM_FL_BL_AST, this is a LDLM_CANCEL
           * rpc which goes to canceld portal, so we can cancel other lru locks
           * here and send them all as one LDLM_CANCEL rpc. */
-        LASSERT(list_empty(&lock->l_bl_ast));
-        list_add(&lock->l_bl_ast, &cancels);
+        LASSERT(cfs_list_empty(&lock->l_bl_ast));
+        cfs_list_add(&lock->l_bl_ast, &cancels);
  
          exp = lock->l_conn_export;
          if (exp_connect_cancelset(exp)) {
@@ -1260,45 +1286,47 @@ int ldlm_cli_cancel(struct lustre_handle *lockh)
                                                    RCL_CLIENT, 0);
                  LASSERT(avail > 0);
  
-                ns = lock->l_resource->lr_namespace;
+                ns = ldlm_lock_to_ns(lock);
                  flags = ns_connect_lru_resize(ns) ?
                          LDLM_CANCEL_LRUR : LDLM_CANCEL_AGED;
                  count += ldlm_cancel_lru_local(ns, &cancels, 0, avail - 1,
-                                               LDLM_FL_BL_AST, flags);
+                                               LCF_BL_AST, flags);
          }
          ldlm_cli_cancel_list(&cancels, count, NULL, 0);
          RETURN(0);
  }
+EXPORT_SYMBOL(ldlm_cli_cancel);
  
  /* XXX until we will have compound requests and can cut cancels from generic rpc
   * we need send cancels with LDLM_FL_BL_AST flag as separate rpc */
-static int ldlm_cancel_list(struct list_head *cancels, int count, int flags)
+int ldlm_cli_cancel_list_local(cfs_list_t *cancels, int count,
+                               ldlm_cancel_flags_t flags)
  {
          CFS_LIST_HEAD(head);
          struct ldlm_lock *lock, *next;
          int left = 0, bl_ast = 0, rc;
  
          left = count;
-        list_for_each_entry_safe(lock, next, cancels, l_bl_ast) {
+        cfs_list_for_each_entry_safe(lock, next, cancels, l_bl_ast) {
                  if (left-- == 0)
                          break;
  
-                if (flags & LDLM_FL_LOCAL_ONLY) {
+                if (flags & LCF_LOCAL) {
                          rc = LDLM_FL_LOCAL_ONLY;
                          ldlm_lock_cancel(lock);
                  } else {
                          rc = ldlm_cli_cancel_local(lock);
                  }
-                if (!(flags & LDLM_FL_BL_AST) && (rc == LDLM_FL_BL_AST)) {
+                if (!(flags & LCF_BL_AST) && (rc == LDLM_FL_BL_AST)) {
                          LDLM_DEBUG(lock, "Cancel lock separately");
-                        list_del_init(&lock->l_bl_ast);
-                        list_add(&lock->l_bl_ast, &head);
+                        cfs_list_del_init(&lock->l_bl_ast);
+                        cfs_list_add(&lock->l_bl_ast, &head);
                          bl_ast ++;
                          continue;
                  }
                  if (rc == LDLM_FL_LOCAL_ONLY) {
                          /* CANCEL RPC should not be sent to server. */
-                        list_del_init(&lock->l_bl_ast);
+                        cfs_list_del_init(&lock->l_bl_ast);
                          LDLM_LOCK_RELEASE(lock);
                          count--;
                  }
@@ -1311,6 +1339,38 @@ static int ldlm_cancel_list(struct list_head *cancels, int count, int flags)
  
          RETURN(count);
  }
+EXPORT_SYMBOL(ldlm_cli_cancel_list_local);
+
+/**
+ * LRU policy selected for LDLM_CANCEL_NO_WAIT: cancel as many locks as
+ * possible w/o sending any rpcs (e.g. to write back dirty data, to close a
+ * file, ...) or waiting for any rpcs in-flight (e.g. readahead requests, ...)
+ *
+ * Only extent and inodebits locks are candidates, and only when the
+ * namespace's ns_cancel_for_recovery callback is set and returns non-zero
+ * for the lock; in that case LDLM_POLICY_CANCEL_LOCK is returned. Every
+ * other lock is flagged LDLM_FL_SKIPPED and LDLM_POLICY_SKIP_LOCK is
+ * returned. The lock's resource lock is held while it is examined.
+ *
+ * @unused, @added and @count are part of the policy signature but are
+ * deliberately not consulted here.
+ */
+static ldlm_policy_res_t ldlm_cancel_no_wait_policy(struct ldlm_namespace *ns,
+                                                    struct ldlm_lock *lock,
+                                                    int unused, int added,
+                                                    int count)
+{
+        ldlm_policy_res_t result = LDLM_POLICY_CANCEL_LOCK;
+        ldlm_cancel_for_recovery cb = ns->ns_cancel_for_recovery;
+
+        lock_res_and_lock(lock);
+
+        /* don't check added & count since we want to process all locks
+         * from unused list */
+        switch (lock->l_resource->lr_type) {
+                case LDLM_EXTENT:
+                case LDLM_IBITS:
+                        if (cb && cb(lock))
+                                break;
+                        /* fall through: no callback, or it refused the lock */
+                default:
+                        result = LDLM_POLICY_SKIP_LOCK;
+                        lock->l_flags |= LDLM_FL_SKIPPED;
+                        break;
+        }
+
+        unlock_res_and_lock(lock);
+        /* plain return: this function has no ENTRY for a RETURN() to pair
+         * with, so using the macro would unbalance the entry/exit trace */
+        return result;
+}
  
  /**
   * Callback function for lru-resize policy. Makes decision whether to keep
@@ -1353,7 +1413,7 @@ static ldlm_policy_res_t ldlm_cancel_lrur_policy(struct ldlm_namespace *ns,
           * Inform pool about current CLV to see it via proc.
           */
          ldlm_pool_set_clv(pl, lv);
-        return (slv == 1 || lv < slv) ?
+        return (slv == 0 || lv < slv) ?
                  LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK;
  }
  
@@ -1433,6 +1493,9 @@ typedef ldlm_policy_res_t (*ldlm_cancel_lru_policy_t)(struct ldlm_namespace *,
  static ldlm_cancel_lru_policy_t
  ldlm_cancel_lru_policy(struct ldlm_namespace *ns, int flags)
  {
+        if (flags & LDLM_CANCEL_NO_WAIT)
+                return ldlm_cancel_no_wait_policy;
+
          if (ns_connect_lru_resize(ns)) {
                  if (flags & LDLM_CANCEL_SHRINK)
                          /* We kill passed number of old locks. */
@@ -1474,17 +1537,23 @@ ldlm_cancel_lru_policy(struct ldlm_namespace *ns, int flags)
   *                              memory pressure policy function;
   *
   * flags & LDLM_CANCEL_AGED -   cancel locks according to "aged policy".
+ *
+ * flags & LDLM_CANCEL_NO_WAIT - cancel as many unused locks as possible
+ *                               (typically before replaying locks) w/o
+ *                               sending any rpcs or waiting for any
+ *                               outstanding rpc to complete.
   */
-int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels,
-                          int count, int max, int cancel_flags, int flags)
+static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, cfs_list_t *cancels,
+                                 int count, int max, int flags)
  {
          ldlm_cancel_lru_policy_t pf;
          struct ldlm_lock *lock, *next;
-        int added = 0, unused;
+        int added = 0, unused, remained;
          ENTRY;
  
-        spin_lock(&ns->ns_unused_lock);
+        cfs_spin_lock(&ns->ns_lock);
          unused = ns->ns_nr_unused;
+        remained = unused;
  
          if (!ns_connect_lru_resize(ns))
                  count += unused - ns->ns_max_unused;
@@ -1492,15 +1561,27 @@ int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels,
          pf = ldlm_cancel_lru_policy(ns, flags);
          LASSERT(pf != NULL);
  
-        while (!list_empty(&ns->ns_unused_list)) {
+        while (!cfs_list_empty(&ns->ns_unused_list)) {
+                ldlm_policy_res_t result;
+
+                /* all unused locks */
+                if (remained-- <= 0)
+                        break;
+
                  /* For any flags, stop scanning if @max is reached. */
                  if (max && added >= max)
                          break;
  
-                list_for_each_entry_safe(lock, next, &ns->ns_unused_list, l_lru){
+                cfs_list_for_each_entry_safe(lock, next, &ns->ns_unused_list,
+                                             l_lru){
                          /* No locks which got blocking requests. */
                          LASSERT(!(lock->l_flags & LDLM_FL_BL_AST));
  
+                        if (flags & LDLM_CANCEL_NO_WAIT &&
+                            lock->l_flags & LDLM_FL_SKIPPED)
+                                /* already processed */
+                                continue;
+
                          /* Somebody is already doing CANCEL. No need in this
                           * lock in lru, do not traverse it again. */
                          if (!(lock->l_flags & LDLM_FL_CANCELING))
@@ -1512,7 +1593,7 @@ int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels,
                          break;
  
                  LDLM_LOCK_GET(lock);
-                spin_unlock(&ns->ns_unused_lock);
+                cfs_spin_unlock(&ns->ns_lock);
                  lu_ref_add(&lock->l_reference, __FUNCTION__, cfs_current());
  
                  /* Pass the lock through the policy filter and see if it
@@ -1528,14 +1609,21 @@ int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels,
                   * old locks, but additionally chose them by
                   * their weight. Big extent locks will stay in
                   * the cache. */
-                if (pf(ns, lock, unused, added, count) ==
-                    LDLM_POLICY_KEEP_LOCK) {
+                result = pf(ns, lock, unused, added, count);
+                if (result == LDLM_POLICY_KEEP_LOCK) {
                          lu_ref_del(&lock->l_reference,
                                     __FUNCTION__, cfs_current());
                          LDLM_LOCK_RELEASE(lock);
-                        spin_lock(&ns->ns_unused_lock);
+                        cfs_spin_lock(&ns->ns_lock);
                          break;
                  }
+                if (result == LDLM_POLICY_SKIP_LOCK) {
+                        lu_ref_del(&lock->l_reference,
+                                   __FUNCTION__, cfs_current());
+                        LDLM_LOCK_RELEASE(lock);
+                        cfs_spin_lock(&ns->ns_lock);
+                        continue;
+                }
  
                  lock_res_and_lock(lock);
                  /* Check flags again under the lock. */
@@ -1550,7 +1638,7 @@ int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels,
                          lu_ref_del(&lock->l_reference,
                                     __FUNCTION__, cfs_current());
                          LDLM_LOCK_RELEASE(lock);
-                        spin_lock(&ns->ns_unused_lock);
+                        cfs_spin_lock(&ns->ns_lock);
                          continue;
                  }
                  LASSERT(!lock->l_readers && !lock->l_writers);
@@ -1572,27 +1660,38 @@ int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels,
  
                  /* We can't re-add to l_lru as it confuses the
                   * refcounting in ldlm_lock_remove_from_lru() if an AST
-                 * arrives after we drop ns_lock below. We use l_bl_ast
+                 * arrives after we drop lr_lock below. We use l_bl_ast
                   * and can't use l_pending_chain as it is used both on
                   * server and client nevertheless bug 5666 says it is
                   * used only on server */
-                LASSERT(list_empty(&lock->l_bl_ast));
-                list_add(&lock->l_bl_ast, cancels);
+                LASSERT(cfs_list_empty(&lock->l_bl_ast));
+                cfs_list_add(&lock->l_bl_ast, cancels);
                  unlock_res_and_lock(lock);
                  lu_ref_del(&lock->l_reference, __FUNCTION__, cfs_current());
-                spin_lock(&ns->ns_unused_lock);
+                cfs_spin_lock(&ns->ns_lock);
                  added++;
                  unused--;
          }
-        spin_unlock(&ns->ns_unused_lock);
-        RETURN(ldlm_cancel_list(cancels, added, cancel_flags));
+        cfs_spin_unlock(&ns->ns_lock);
+        RETURN(added);
+}
+
+int ldlm_cancel_lru_local(struct ldlm_namespace *ns, cfs_list_t *cancels,
+                          int count, int max, ldlm_cancel_flags_t cancel_flags,
+                          int flags)
+{
+        /* Step one: collect the cancel candidates on @cancels. */
+        int prepared = ldlm_prepare_lru_list(ns, cancels, count, max, flags);
+
+        /* Step two: cancel them locally. A non-positive result from step
+         * one is handed back to the caller untouched. */
+        if (prepared > 0)
+                return ldlm_cli_cancel_list_local(cancels, prepared,
+                                                  cancel_flags);
+        return prepared;
  }
  
  /* when called with LDLM_ASYNC the blocking callback will be handled
   * in a thread and this function will return after the thread has been
   * asked to call the callback.  when called with LDLM_SYNC the blocking
   * callback will be performed in this function. */
-int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, ldlm_sync_t sync,
+int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, ldlm_sync_t mode,
                      int flags)
  {
          CFS_LIST_HEAD(cancels);
@@ -1600,36 +1699,33 @@ int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, ldlm_sync_t sync,
          ENTRY;
  
  #ifndef __KERNEL__
-        sync = LDLM_SYNC; /* force to be sync in user space */
+        mode = LDLM_SYNC; /* force to be sync in user space */
  #endif
-        count = ldlm_cancel_lru_local(ns, &cancels, nr, 0, 0, flags);
-        if (sync == LDLM_ASYNC) {
-                rc = ldlm_bl_to_thread_list(ns, NULL, &cancels, count);
-                if (rc == 0)
-                        RETURN(count);
-        }
+        /* Just prepare the list of locks, do not actually cancel them yet.
+         * Locks are cancelled later in a separate thread. */
+        count = ldlm_prepare_lru_list(ns, &cancels, nr, 0, flags);
+        rc = ldlm_bl_to_thread_list(ns, NULL, &cancels, count, mode);
+        if (rc == 0)
+                RETURN(count);
  
-        /* If an error occured in ASYNC mode, or this is SYNC mode,
-         * cancel the list. */
-        ldlm_cli_cancel_list(&cancels, count, NULL, 0);
-        RETURN(count);
+        RETURN(0);
  }
  
  /* Find and cancel locally unused locks found on resource, matched to the
   * given policy, mode. GET the found locks and add them into the @cancels
   * list. */
  int ldlm_cancel_resource_local(struct ldlm_resource *res,
-                               struct list_head *cancels,
+                               cfs_list_t *cancels,
                                 ldlm_policy_data_t *policy,
                                 ldlm_mode_t mode, int lock_flags,
-                               int cancel_flags, void *opaque)
+                               ldlm_cancel_flags_t cancel_flags, void *opaque)
  {
          struct ldlm_lock *lock;
          int count = 0;
          ENTRY;
  
          lock_res(res);
-        list_for_each_entry(lock, &res->lr_granted, l_res_link) {
+        cfs_list_for_each_entry(lock, &res->lr_granted, l_res_link) {
                  if (opaque != NULL && lock->l_ast_data != opaque) {
                          LDLM_ERROR(lock, "data %p doesn't match opaque %p",
                                     lock->l_ast_data, opaque);
@@ -1637,13 +1733,8 @@ int ldlm_cancel_resource_local(struct ldlm_resource *res,
                          continue;
                  }
  
-                if (lock->l_readers || lock->l_writers) {
-                        if (cancel_flags & LDLM_FL_WARN) {
-                                LDLM_ERROR(lock, "lock in use");
-                                //LBUG();
-                        }
+                if (lock->l_readers || lock->l_writers)
                          continue;
-                }
  
                  /* If somebody is already doing CANCEL, or blocking ast came,
                   * skip this lock. */
@@ -1665,15 +1756,16 @@ int ldlm_cancel_resource_local(struct ldlm_resource *res,
                  lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_CANCELING |
                                   lock_flags;
  
-                LASSERT(list_empty(&lock->l_bl_ast));
-                list_add(&lock->l_bl_ast, cancels);
+                LASSERT(cfs_list_empty(&lock->l_bl_ast));
+                cfs_list_add(&lock->l_bl_ast, cancels);
                  LDLM_LOCK_GET(lock);
                  count++;
          }
          unlock_res(res);
  
-        RETURN(ldlm_cancel_list(cancels, count, cancel_flags));
+        RETURN(ldlm_cli_cancel_list_local(cancels, count, cancel_flags));
  }
+EXPORT_SYMBOL(ldlm_cancel_resource_local);
  
  /* If @req is NULL, send CANCEL request to server with handles of locks
   * in the @cancels. If EARLY_CANCEL is not supported, send CANCEL requests
@@ -1681,14 +1773,14 @@ int ldlm_cancel_resource_local(struct ldlm_resource *res,
   * If @req is not NULL, put handles of locks in @cancels into the request
   * buffer at the offset @off.
   * Destroy @cancels at the end. */
-int ldlm_cli_cancel_list(struct list_head *cancels, int count,
-                         struct ptlrpc_request *req, int flags)
+int ldlm_cli_cancel_list(cfs_list_t *cancels, int count,
+                         struct ptlrpc_request *req, ldlm_cancel_flags_t flags)
  {
          struct ldlm_lock *lock;
          int res = 0;
          ENTRY;
  
-        if (list_empty(cancels) || count == 0)
+        if (cfs_list_empty(cancels) || count == 0)
                  RETURN(0);
  
          /* XXX: requests (both batched and not) could be sent in parallel.
@@ -1697,8 +1789,9 @@ int ldlm_cli_cancel_list(struct list_head *cancels, int count,
           * It would also speed up the case when the server does not support
           * the feature. */
          while (count > 0) {
-                LASSERT(!list_empty(cancels));
-                lock = list_entry(cancels->next, struct ldlm_lock, l_bl_ast);
+                LASSERT(!cfs_list_empty(cancels));
+                lock = cfs_list_entry(cancels->next, struct ldlm_lock,
+                                      l_bl_ast);
                  LASSERT(lock->l_conn_export);
  
                  if (exp_connect_cancelset(lock->l_conn_export)) {
@@ -1725,11 +1818,14 @@ int ldlm_cli_cancel_list(struct list_head *cancels, int count,
          LASSERT(count == 0);
          RETURN(0);
  }
+EXPORT_SYMBOL(ldlm_cli_cancel_list);
  
  int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns,
                                      const struct ldlm_res_id *res_id,
                                      ldlm_policy_data_t *policy,
-                                    ldlm_mode_t mode, int flags, void *opaque)
+                                    ldlm_mode_t mode,
+                                    ldlm_cancel_flags_t flags,
+                                    void *opaque)
  {
          struct ldlm_resource *res;
          CFS_LIST_HEAD(cancels);
@@ -1746,7 +1842,7 @@ int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns,
  
          LDLM_RESOURCE_ADDREF(res);
          count = ldlm_cancel_resource_local(res, &cancels, policy, mode,
-                                           0, flags, opaque);
+                                           0, flags | LCF_BL_AST, opaque);
          rc = ldlm_cli_cancel_list(&cancels, count, NULL, flags);
          if (rc != ELDLM_OK)
                  CERROR("ldlm_cli_cancel_unused_resource: %d\n", rc);
@@ -1755,77 +1851,68 @@ int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns,
          ldlm_resource_putref(res);
          RETURN(0);
  }
+EXPORT_SYMBOL(ldlm_cli_cancel_unused_resource);
  
-static inline int have_no_nsresource(struct ldlm_namespace *ns)
-{
-        int no_resource = 0;
-
-        spin_lock(&ns->ns_hash_lock);
-        if (ns->ns_resources == 0)
-                no_resource = 1;
-        spin_unlock(&ns->ns_hash_lock);
+struct ldlm_cli_cancel_arg {
+        /* Argument bundle handed through the resource-hash walk to
+         * ldlm_cli_hash_cancel_unused(). */
+        /* cancel flags forwarded to ldlm_cli_cancel_unused_resource() */
+        int     lc_flags;
+        /* caller's opaque pointer, forwarded unchanged with the flags */
+        void   *lc_opaque;
+};
  
-        RETURN(no_resource);
+static int ldlm_cli_hash_cancel_unused(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                                       cfs_hlist_node_t *hnode, void *arg)
+{
+        /* Hash-walk callback: cancel the unused locks of the single resource
+         * stored at @hnode. @arg points to a struct ldlm_cli_cancel_arg that
+         * supplies the flags and opaque pointer; @bd is not used. */
+        struct ldlm_resource           *res = cfs_hash_object(hs, hnode);
+        struct ldlm_cli_cancel_arg     *lc = arg;
+        int                             rc;
+
+        rc = ldlm_cli_cancel_unused_resource(ldlm_res_to_ns(res), &res->lr_name,
+                                             NULL, LCK_MINMODE,
+                                             lc->lc_flags, lc->lc_opaque);
+        /* A failure is only logged; it is not propagated to the walker. */
+        if (rc != 0) {
+                CERROR("ldlm_cli_cancel_unused ("LPU64"): %d\n",
+                       res->lr_name.name[0], rc);
+        }
+        /* must return 0 for hash iteration */
+        return 0;
  }
  
  /* Cancel all locks on a namespace (or a specific resource, if given)
   * that have 0 readers/writers.
   *
- * If flags & LDLM_FL_LOCAL_ONLY, throw the locks away without trying
+ * If flags & LCF_LOCAL, throw the locks away without trying
   * to notify the server. */
  int ldlm_cli_cancel_unused(struct ldlm_namespace *ns,
                             const struct ldlm_res_id *res_id,
-                           int flags, void *opaque)
+                           ldlm_cancel_flags_t flags, void *opaque)
  {
-        int i;
+        /* flags and opaque bundled for the per-resource hash callback */
+        struct ldlm_cli_cancel_arg arg = {
+                .lc_flags       = flags,
+                .lc_opaque      = opaque,
+        };
+
          ENTRY;
  
          if (ns == NULL)
                  RETURN(ELDLM_OK);
  
-        if (res_id)
+        if (res_id != NULL) {
+                /* A resource was named: cancel only its unused locks and
+                 * hand back that call's result. */
                  RETURN(ldlm_cli_cancel_unused_resource(ns, res_id, NULL,
                                                         LCK_MINMODE, flags,
                                                         opaque));
-
-        spin_lock(&ns->ns_hash_lock);
-        for (i = 0; i < RES_HASH_SIZE; i++) {
-                struct list_head *tmp;
-                tmp = ns->ns_hash[i].next;
-                while (tmp != &(ns->ns_hash[i])) {
-                        struct ldlm_resource *res;
-                        int rc;
-
-                        res = list_entry(tmp, struct ldlm_resource, lr_hash);
-                        ldlm_resource_getref(res);
-                        spin_unlock(&ns->ns_hash_lock);
-
-                        LDLM_RESOURCE_ADDREF(res);
-                        rc = ldlm_cli_cancel_unused_resource(ns, &res->lr_name,
-                                                             NULL, LCK_MINMODE,
-                                                             flags, opaque);
-
-                        if (rc)
-                                CERROR("ldlm_cli_cancel_unused ("LPU64"): %d\n",
-                                       res->lr_name.name[0], rc);
-
-                        LDLM_RESOURCE_DELREF(res);
-                        spin_lock(&ns->ns_hash_lock);
-                        tmp = tmp->next;
-                        ldlm_resource_putref_locked(res);
-                }
+        } else {
+                /* No resource named: run the callback over the namespace's
+                 * resource hash. The callback logs failures and always
+                 * returns 0, so this path reports ELDLM_OK unconditionally. */
+                cfs_hash_for_each_nolock(ns->ns_rs_hash,
+                                         ldlm_cli_hash_cancel_unused, &arg);
+                RETURN(ELDLM_OK);
          }
-        spin_unlock(&ns->ns_hash_lock);
-
-        RETURN(ELDLM_OK);
  }
+EXPORT_SYMBOL(ldlm_cli_cancel_unused);
  
  /* Lock iterators. */
  
  int ldlm_resource_foreach(struct ldlm_resource *res, ldlm_iterator_t iter,
                            void *closure)
  {
-        struct list_head *tmp, *next;
+        cfs_list_t *tmp, *next;
          struct ldlm_lock *lock;
          int rc = LDLM_ITER_CONTINUE;
  
@@ -1835,22 +1922,22 @@ int ldlm_resource_foreach(struct ldlm_resource *res, ldlm_iterator_t iter,
                  RETURN(LDLM_ITER_CONTINUE);
  
          lock_res(res);
-        list_for_each_safe(tmp, next, &res->lr_granted) {
-                lock = list_entry(tmp, struct ldlm_lock, l_res_link);
+        cfs_list_for_each_safe(tmp, next, &res->lr_granted) {
+                lock = cfs_list_entry(tmp, struct ldlm_lock, l_res_link);
  
                  if (iter(lock, closure) == LDLM_ITER_STOP)
                          GOTO(out, rc = LDLM_ITER_STOP);
          }
  
-        list_for_each_safe(tmp, next, &res->lr_converting) {
-                lock = list_entry(tmp, struct ldlm_lock, l_res_link);
+        cfs_list_for_each_safe(tmp, next, &res->lr_converting) {
+                lock = cfs_list_entry(tmp, struct ldlm_lock, l_res_link);
  
                  if (iter(lock, closure) == LDLM_ITER_STOP)
                          GOTO(out, rc = LDLM_ITER_STOP);
          }
  
-        list_for_each_safe(tmp, next, &res->lr_waiting) {
-                lock = list_entry(tmp, struct ldlm_lock, l_res_link);
+        cfs_list_for_each_safe(tmp, next, &res->lr_waiting) {
+                lock = cfs_list_entry(tmp, struct ldlm_lock, l_res_link);
  
                  if (iter(lock, closure) == LDLM_ITER_STOP)
                          GOTO(out, rc = LDLM_ITER_STOP);
@@ -1859,6 +1946,7 @@ int ldlm_resource_foreach(struct ldlm_resource *res, ldlm_iterator_t iter,
          unlock_res(res);
          RETURN(rc);
  }
+EXPORT_SYMBOL(ldlm_resource_foreach);
  
  struct iter_helper_data {
          ldlm_iterator_t iter;
@@ -1871,56 +1959,39 @@ static int ldlm_iter_helper(struct ldlm_lock *lock, void *closure)
          return helper->iter(lock, helper->closure);
  }
  
-static int ldlm_res_iter_helper(struct ldlm_resource *res, void *closure)
+static int ldlm_res_iter_helper(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                                cfs_hlist_node_t *hnode, void *arg)
+
  {
-        return ldlm_resource_foreach(res, ldlm_iter_helper, closure);
+        struct ldlm_resource *res = cfs_hash_object(hs, hnode);
+
+        /* Run the per-lock iterator wrapped in @arg over this resource and
+         * yield 1 if it asked to stop, 0 otherwise.
+         * NOTE(review): presumably a non-zero return ends the hash walk --
+         * confirm against cfs_hash_for_each_nolock(). */
+        return ldlm_resource_foreach(res, ldlm_iter_helper, arg) ==
+               LDLM_ITER_STOP;
  }
  
-int ldlm_namespace_foreach(struct ldlm_namespace *ns, ldlm_iterator_t iter,
-                           void *closure)
+void ldlm_namespace_foreach(struct ldlm_namespace *ns,
+                            ldlm_iterator_t iter, void *closure)
+
  {
          struct iter_helper_data helper = { iter: iter, closure: closure };
-        return ldlm_namespace_foreach_res(ns, ldlm_res_iter_helper, &helper);
-}
  
-int ldlm_namespace_foreach_res(struct ldlm_namespace *ns,
-                               ldlm_res_iterator_t iter, void *closure)
-{
-        int i, rc = LDLM_ITER_CONTINUE;
-        struct ldlm_resource *res;
-        struct list_head *tmp;
+        /* Apply @iter (with @closure) to the locks of the resources in the
+         * namespace's resource hash. Nothing is returned, so a caller cannot
+         * tell whether the iterator stopped the walk early. */
+        cfs_hash_for_each_nolock(ns->ns_rs_hash,
+                                 ldlm_res_iter_helper, &helper);
  
-        ENTRY;
-        spin_lock(&ns->ns_hash_lock);
-        for (i = 0; i < RES_HASH_SIZE; i++) {
-                tmp = ns->ns_hash[i].next;
-                while (tmp != &(ns->ns_hash[i])) {
-                        res = list_entry(tmp, struct ldlm_resource, lr_hash);
-                        ldlm_resource_getref(res);
-                        spin_unlock(&ns->ns_hash_lock);
-                        LDLM_RESOURCE_ADDREF(res);
-
-                        rc = iter(res, closure);
-
-                        LDLM_RESOURCE_DELREF(res);
-                        spin_lock(&ns->ns_hash_lock);
-                        tmp = tmp->next;
-                        ldlm_resource_putref_locked(res);
-                        if (rc == LDLM_ITER_STOP)
-                                GOTO(out, rc);
-                }
-        }
- out:
-        spin_unlock(&ns->ns_hash_lock);
-        RETURN(rc);
  }
+EXPORT_SYMBOL(ldlm_namespace_foreach);
  
-/* non-blocking function to manipulate a lock whose cb_data is being put away.*/
-void ldlm_resource_iterate(struct ldlm_namespace *ns,
-                           const struct ldlm_res_id *res_id,
-                           ldlm_iterator_t iter, void *data)
+/* non-blocking function to manipulate a lock whose cb_data is being put away.
+ * return  0:  find no resource
+ *       > 0:  must be LDLM_ITER_STOP/LDLM_ITER_CONTINUE.
+ *       < 0:  errors
+ */
+int ldlm_resource_iterate(struct ldlm_namespace *ns,
+                          const struct ldlm_res_id *res_id,
+                          ldlm_iterator_t iter, void *data)
  {
          struct ldlm_resource *res;
+        int rc;
          ENTRY;
  
          if (ns == NULL) {
@@ -1929,30 +2000,36 @@ void ldlm_resource_iterate(struct ldlm_namespace *ns,
          }
  
          res = ldlm_resource_get(ns, NULL, res_id, 0, 0);
-        if (res == NULL) {
-                EXIT;
-                return;
-        }
+        if (res == NULL)
+                RETURN(0);
  
          LDLM_RESOURCE_ADDREF(res);
-        ldlm_resource_foreach(res, iter, data);
+        rc = ldlm_resource_foreach(res, iter, data);
          LDLM_RESOURCE_DELREF(res);
          ldlm_resource_putref(res);
-        EXIT;
+        RETURN(rc);
  }
+EXPORT_SYMBOL(ldlm_resource_iterate);
  
  /* Lock replay */
  
  static int ldlm_chain_lock_for_replay(struct ldlm_lock *lock, void *closure)
  {
+        /* Per-lock iterator callback: chain @lock through l_pending_chain
+         * onto the list passed in @closure and take a reference on it,
+         * unless the lock is failed or being cancelled. Always asks the
+         * iteration to continue. */
-        struct list_head *list = closure;
+        cfs_list_t *list = closure;
  
          /* we use l_pending_chain here, because it's unused on clients. */
+        /* NOTE(review): the assertion message is given the addresses of the
+         * next/prev fields, not the pointers stored in them -- confirm
+         * whether the stored values were intended. */
-        LASSERTF(list_empty(&lock->l_pending_chain),"lock %p next %p prev %p\n",
+        LASSERTF(cfs_list_empty(&lock->l_pending_chain),
+                 "lock %p next %p prev %p\n",
                   lock, &lock->l_pending_chain.next,&lock->l_pending_chain.prev);
-        /* bug 9573: don't replay locks left after eviction */
-        if (!(lock->l_flags & LDLM_FL_FAILED))
-                list_add(&lock->l_pending_chain, list);
+        /* bug 9573: don't replay locks left after eviction, or
+         * bug 17614: locks being actively cancelled. Get a reference
+         * on a lock so that it does not disappear under us (e.g. due to cancel)
+         */
+        if (!(lock->l_flags & (LDLM_FL_FAILED|LDLM_FL_CANCELING))) {
+                cfs_list_add(&lock->l_pending_chain, list);
+                LDLM_LOCK_GET(lock);
+        }
+
          return LDLM_ITER_CONTINUE;
  }
  
@@ -1960,13 +2037,12 @@ static int replay_lock_interpret(const struct lu_env *env,
                                   struct ptlrpc_request *req,
                                   struct ldlm_async_args *aa, int rc)
  {
-        struct lustre_handle  old_hash_key;
          struct ldlm_lock     *lock;
          struct ldlm_reply    *reply;
          struct obd_export    *exp;
  
          ENTRY;
-        atomic_dec(&req->rq_import->imp_replay_inflight);
+        cfs_atomic_dec(&req->rq_import->imp_replay_inflight);
          if (rc != ELDLM_OK)
                  GOTO(out, rc);
  
@@ -1985,22 +2061,26 @@ static int replay_lock_interpret(const struct lu_env *env,
                  GOTO(out, rc = -ESTALE);
          }
  
-        old_hash_key = lock->l_remote_handle;
-        lock->l_remote_handle = reply->lock_handle;
-
          /* Key change rehash lock in per-export hash with new key */
          exp = req->rq_export;
-        if (exp && exp->exp_lock_hash)
-                lustre_hash_rehash_key(exp->exp_lock_hash, &old_hash_key,
-                                       &lock->l_remote_handle,
-                                       &lock->l_exp_hash);
+        if (exp && exp->exp_lock_hash) {
+               /* In the function below, .hs_keycmp resolves to
+                * ldlm_export_lock_keycmp() */
+               /* coverity[overrun-buffer-val] */
+                cfs_hash_rehash_key(exp->exp_lock_hash,
+                                    &lock->l_remote_handle,
+                                    &reply->lock_handle,
+                                    &lock->l_exp_hash);
+        } else {
+                lock->l_remote_handle = reply->lock_handle;
+        }
  
          LDLM_DEBUG(lock, "replayed lock:");
          ptlrpc_import_recovery_state_machine(req->rq_import);
          LDLM_LOCK_PUT(lock);
  out:
          if (rc != ELDLM_OK)
-                ptlrpc_connect_import(req->rq_import, NULL);
+                ptlrpc_connect_import(req->rq_import);
  
          RETURN(rc);
  }
@@ -2046,7 +2126,7 @@ static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock)
                  flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_GRANTED;
          else if (lock->l_granted_mode)
                  flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_CONV;
-        else if (!list_empty(&lock->l_res_link))
+        else if (!cfs_list_empty(&lock->l_res_link))
                  flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_WAIT;
          else
                  flags = LDLM_FL_REPLAY;
@@ -2078,16 +2158,45 @@ static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock)
  
          LDLM_DEBUG(lock, "replaying lock:");
  
-        atomic_inc(&req->rq_import->imp_replay_inflight);
+        cfs_atomic_inc(&req->rq_import->imp_replay_inflight);
          CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
          aa = ptlrpc_req_async_args(req);
          aa->lock_handle = body->lock_handle[0];
          req->rq_interpret_reply = (ptlrpc_interpterer_t)replay_lock_interpret;
-        ptlrpcd_add_req(req, PSCOPE_OTHER);
+        ptlrpcd_add_req(req, PDL_POLICY_LOCAL, -1);
  
          RETURN(0);
  }
  
+/**
+ * Cancel as many unused locks as possible before replay. Since we are
+ * in recovery, we can neither wait for any outstanding RPC nor send any
+ * RPC to the server.
+ *
+ * Called only in recovery before replaying locks. There is no need to
+ * replay locks that are unused. Since the clients may hold thousands of
+ * cached unused locks, dropping the unused locks can greatly reduce the
+ * load on the servers at recovery time.
+ *
+ * The number of locks canceled is only reported through the debug log;
+ * nothing is returned to the caller.
+ */
+static void ldlm_cancel_unused_locks_for_replay(struct ldlm_namespace *ns)
+{
+        int canceled;
+        CFS_LIST_HEAD(cancels);
+
+        /* The two literals are concatenated by the compiler, so the first
+         * must end in a space to keep "before replay" as two words. */
+        CDEBUG(D_DLMTRACE, "Dropping as many unused locks as possible before "
+                           "replay for namespace %s (%d)\n",
+                           ldlm_ns_name(ns), ns->ns_nr_unused);
+
+        /* We don't need to care whether or not LRU resize is enabled
+         * because the LDLM_CANCEL_NO_WAIT policy doesn't use the
+         * count parameter */
+        canceled = ldlm_cancel_lru_local(ns, &cancels, ns->ns_nr_unused, 0,
+                                         LCF_LOCAL, LDLM_CANCEL_NO_WAIT);
+
+        CDEBUG(D_DLMTRACE, "Canceled %d unused locks from namespace %s\n",
+                           canceled, ldlm_ns_name(ns));
+}
+
  int ldlm_replay_locks(struct obd_import *imp)
  {
          struct ldlm_namespace *ns = imp->imp_obd->obd_namespace;
@@ -2097,21 +2206,32 @@ int ldlm_replay_locks(struct obd_import *imp)
  
          ENTRY;
  
-        LASSERT(atomic_read(&imp->imp_replay_inflight) == 0);
+        LASSERT(cfs_atomic_read(&imp->imp_replay_inflight) == 0);
+
+        /* don't replay locks if import failed recovery */
+        if (imp->imp_vbr_failed)
+                RETURN(0);
  
          /* ensure this doesn't fall to 0 before all have been queued */
-        atomic_inc(&imp->imp_replay_inflight);
+        cfs_atomic_inc(&imp->imp_replay_inflight);
+
+        if (ldlm_cancel_unused_locks_before_replay)
+                ldlm_cancel_unused_locks_for_replay(ns);
  
-        (void)ldlm_namespace_foreach(ns, ldlm_chain_lock_for_replay, &list);
+        ldlm_namespace_foreach(ns, ldlm_chain_lock_for_replay, &list);
  
-        list_for_each_entry_safe(lock, next, &list, l_pending_chain) {
-                list_del_init(&lock->l_pending_chain);
-                if (rc)
+        cfs_list_for_each_entry_safe(lock, next, &list, l_pending_chain) {
+                cfs_list_del_init(&lock->l_pending_chain);
+                if (rc) {
+                        LDLM_LOCK_RELEASE(lock);
                          continue; /* or try to do the rest? */
+                }
                  rc = replay_one_lock(imp, lock);
+                LDLM_LOCK_RELEASE(lock);
          }
  
-        atomic_dec(&imp->imp_replay_inflight);
+        cfs_atomic_dec(&imp->imp_replay_inflight);
  
          RETURN(rc);
  }
+EXPORT_SYMBOL(ldlm_replay_locks);