b=24375 Fix lvb updating race in enqueue vs completion case

[fs/lustre-release.git] / lustre / ldlm / ldlm_request.c
diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c

index f287f1a..5d718c9 100644 (file)
--- a/lustre/ldlm/ldlm_request.c
+++ b/lustre/ldlm/ldlm_request.c
@@ -26,7 +26,7 @@
   * GPL HEADER END
   */
  /*
- * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
   * Use is subject to license terms.
   */
  /*
@@ -50,6 +50,9 @@ int ldlm_enqueue_min = OBD_TIMEOUT_DEFAULT;
  CFS_MODULE_PARM(ldlm_enqueue_min, "i", int, 0644,
                  "lock enqueue timeout minimum");
  
+/* Client side: cancel cached unused locks before replay when non-zero */
+unsigned int ldlm_cancel_unused_locks_before_replay = 1;
+
  static void interrupted_completion_wait(void *data)
  {
  }
@@ -87,7 +90,7 @@ int ldlm_expired_completion_wait(void *data)
                          last_dump = next_dump;
                          next_dump = cfs_time_shift(300);
                          ldlm_namespace_dump(D_DLMTRACE,
-                                            lock->l_resource->lr_namespace);
+                                            ldlm_lock_to_ns(lock));
                          if (last_dump == 0)
                                  libcfs_debug_dumplog();
                  }
@@ -110,7 +113,7 @@ int ldlm_expired_completion_wait(void *data)
     from a single node. */
  int ldlm_get_enq_timeout(struct ldlm_lock *lock)
  {
-        int timeout = at_get(&lock->l_resource->lr_namespace->ns_at_estimate);
+        int timeout = at_get(ldlm_lock_to_ns_at(lock));
          if (AT_OFF)
                  return obd_timeout / 2;
          /* Since these are non-updating timeouts, we should be conservative.
@@ -140,7 +143,7 @@ static int ldlm_completion_tail(struct ldlm_lock *lock)
                             CFS_DURATION_T"s", delay);
  
                  /* Update our time estimate */
-                at_measured(&lock->l_resource->lr_namespace->ns_at_estimate,
+                at_measured(ldlm_lock_to_ns_at(lock),
                              delay);
                  result = 0;
          }
@@ -252,7 +255,7 @@ noreproc:
                  cfs_spin_unlock(&imp->imp_lock);
          }
  
-        if (ns_is_client(lock->l_resource->lr_namespace) &&
+        if (ns_is_client(ldlm_lock_to_ns(lock)) &&
              OBD_FAIL_CHECK_RESET(OBD_FAIL_LDLM_INTR_CP_AST,
                                   OBD_FAIL_LDLM_CP_BL_RACE | OBD_FAIL_ONCE)) {
                  lock->l_flags |= LDLM_FL_FAIL_LOC;
@@ -334,7 +337,7 @@ int ldlm_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
          lock_res_and_lock(lock);
          /* Get this: if ldlm_blocking_ast is racing with intent_policy, such
           * that ldlm_blocking_ast is called just before intent_policy method
-         * takes the ns_lock, then by the time we get the lock, we might not
+         * takes the lr_lock, then by the time we get the lock, we might not
           * be the correct blocking function anymore.  So check, and return
           * early, if so. */
          if (lock->l_blocking_ast != ldlm_blocking_ast) {
@@ -400,13 +403,15 @@ int ldlm_cli_enqueue_local(struct ldlm_namespace *ns,
          if (unlikely(!lock))
                  GOTO(out_nolock, err = -ENOMEM);
  
-        ldlm_lock_addref_internal(lock, mode);
          ldlm_lock2handle(lock, lockh);
-        lock_res_and_lock(lock);
+
+        /* NB: we don't have any lock now (lock_res_and_lock)
+         * because it's a new lock */
+        ldlm_lock_addref_internal_nolock(lock, mode);
          lock->l_flags |= LDLM_FL_LOCAL;
          if (*flags & LDLM_FL_ATOMIC_CB)
                  lock->l_flags |= LDLM_FL_ATOMIC_CB;
-        unlock_res_and_lock(lock);
+
          if (policy != NULL)
                  lock->l_policy_data = *policy;
          if (client_cookie != NULL)
@@ -481,9 +486,9 @@ int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req,
  {
          struct ldlm_namespace *ns = exp->exp_obd->obd_namespace;
          int is_replay = *flags & LDLM_FL_REPLAY;
-        struct lustre_handle old_hash_key;
          struct ldlm_lock *lock;
          struct ldlm_reply *reply;
+        struct ost_lvb *tmplvb;
          int cleanup_phase = 1;
          ENTRY;
  
@@ -505,12 +510,11 @@ int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req,
                          if (reply == NULL)
                                  rc = -EPROTO;
                          if (lvb_len) {
-                                struct ost_lvb *tmplvb;
  
                                  req_capsule_set_size(&req->rq_pill,
                                                       &RMF_DLM_LVB, RCL_SERVER,
                                                       lvb_len);
-                            tmplvb = req_capsule_server_get(&req->rq_pill,
+                                tmplvb = req_capsule_server_get(&req->rq_pill,
                                                                   &RMF_DLM_LVB);
                                  if (tmplvb == NULL)
                                          GOTO(cleanup, rc = -EPROTO);
@@ -529,14 +533,15 @@ int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req,
          cleanup_phase = 0;
  
          lock_res_and_lock(lock);
-        old_hash_key = lock->l_remote_handle;
-        lock->l_remote_handle = reply->lock_handle;
-
          /* Key change rehash lock in per-export hash with new key */
-        if (exp->exp_lock_hash)
-                cfs_hash_rehash_key(exp->exp_lock_hash, &old_hash_key,
+        if (exp->exp_lock_hash) {
+                cfs_hash_rehash_key(exp->exp_lock_hash,
                                      &lock->l_remote_handle,
+                                    &reply->lock_handle,
                                      &lock->l_exp_hash);
+        } else {
+                lock->l_remote_handle = reply->lock_handle;
+        }
  
          *flags = reply->lock_flags;
          lock->l_flags |= reply->lock_flags & LDLM_INHERIT_FLAGS;
@@ -601,16 +606,25 @@ int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req,
  
          /* If the lock has already been granted by a completion AST, don't
           * clobber the LVB with an older one. */
-        if (lvb_len && (lock->l_req_mode != lock->l_granted_mode)) {
-                void *tmplvb;
+        if (lvb_len) {
+                /* We must lock or a racing completion might update lvb
+                   without letting us know and we'll clobber the correct value.
+                   Cannot unlock after the check either, as that still leaves
+                   a tiny window for completion to get in */
+                lock_res_and_lock(lock);
+                if (lock->l_req_mode != lock->l_granted_mode) {
  
-                req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
-                                     lvb_len);
-                tmplvb = req_capsule_server_get(&req->rq_pill,
-                                                     &RMF_DLM_LVB);
-                if (tmplvb == NULL)
-                        GOTO(cleanup, rc = -EPROTO);
-                memcpy(lock->l_lvb_data, tmplvb, lvb_len);
+                        req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB,
+                                             RCL_SERVER, lvb_len);
+                        tmplvb = req_capsule_server_get(&req->rq_pill,
+                                                             &RMF_DLM_LVB);
+                        if (tmplvb == NULL) {
+                                unlock_res_and_lock(lock);
+                                GOTO(cleanup, rc = -EPROTO);
+                        }
+                        memcpy(lock->l_lvb_data, tmplvb, lvb_len);
+                }
+                unlock_res_and_lock(lock);
          }
  
          if (!is_replay) {
@@ -902,7 +916,7 @@ static int ldlm_cli_convert_local(struct ldlm_lock *lock, int new_mode,
          struct ldlm_resource *res;
          int rc;
          ENTRY;
-        if (ns_is_client(lock->l_resource->lr_namespace)) {
+        if (ns_is_client(ldlm_lock_to_ns(lock))) {
                  CERROR("Trying to cancel local lock\n");
                  LBUG();
          }
@@ -1025,7 +1039,7 @@ static int ldlm_cli_cancel_local(struct ldlm_lock *lock)
                  }
                  ldlm_lock_cancel(lock);
          } else {
-                if (ns_is_client(lock->l_resource->lr_namespace)) {
+                if (ns_is_client(ldlm_lock_to_ns(lock))) {
                          LDLM_ERROR(lock, "Trying to cancel local lock");
                          LBUG();
                  }
@@ -1257,7 +1271,7 @@ int ldlm_cli_cancel(struct lustre_handle *lockh)
                                                    RCL_CLIENT, 0);
                  LASSERT(avail > 0);
  
-                ns = lock->l_resource->lr_namespace;
+                ns = ldlm_lock_to_ns(lock);
                  flags = ns_connect_lru_resize(ns) ?
                          LDLM_CANCEL_LRUR : LDLM_CANCEL_AGED;
                  count += ldlm_cancel_lru_local(ns, &cancels, 0, avail - 1,
@@ -1269,8 +1283,8 @@ int ldlm_cli_cancel(struct lustre_handle *lockh)
  
  /* XXX until we will have compound requests and can cut cancels from generic rpc
   * we need send cancels with LDLM_FL_BL_AST flag as separate rpc */
-static int ldlm_cancel_list(cfs_list_t *cancels, int count,
-                            ldlm_cancel_flags_t flags)
+int ldlm_cli_cancel_list_local(cfs_list_t *cancels, int count,
+                               ldlm_cancel_flags_t flags)
  {
          CFS_LIST_HEAD(head);
          struct ldlm_lock *lock, *next;
@@ -1311,6 +1325,37 @@ static int ldlm_cancel_list(cfs_list_t *cancels, int count,
  }
  
  /**
+ * Cancel as many locks as possible w/o sending any rpcs (e.g. to write back
+ * dirty data, to close a file, ...) or waiting for any rpcs in-flight (e.g.
+ * readahead requests, ...)
+ */
+static ldlm_policy_res_t ldlm_cancel_no_wait_policy(struct ldlm_namespace *ns,
+                                                    struct ldlm_lock *lock,
+                                                    int unused, int added,
+                                                    int count)
+{
+        ldlm_policy_res_t result = LDLM_POLICY_CANCEL_LOCK;
+        ldlm_cancel_for_recovery cb = ns->ns_cancel_for_recovery;
+        lock_res_and_lock(lock);
+
+        /* don't check added & count since we want to process all locks
+         * from unused list */
+        switch (lock->l_resource->lr_type) {
+                case LDLM_EXTENT:
+                case LDLM_IBITS:
+                        if (cb && cb(lock))
+                                break;
+                default:
+                        result = LDLM_POLICY_SKIP_LOCK;
+                        lock->l_flags |= LDLM_FL_SKIPPED;
+                        break;
+        }
+
+        unlock_res_and_lock(lock);
+        RETURN(result);
+}
+
+/**
   * Callback function for lru-resize policy. Makes decision whether to keep
   * \a lock in LRU for current \a LRU size \a unused, added in current scan
   * \a added and number of locks to be preferably canceled \a count.
@@ -1431,6 +1476,9 @@ typedef ldlm_policy_res_t (*ldlm_cancel_lru_policy_t)(struct ldlm_namespace *,
  static ldlm_cancel_lru_policy_t
  ldlm_cancel_lru_policy(struct ldlm_namespace *ns, int flags)
  {
+        if (flags & LDLM_CANCEL_NO_WAIT)
+                return ldlm_cancel_no_wait_policy;
+
          if (ns_connect_lru_resize(ns)) {
                  if (flags & LDLM_CANCEL_SHRINK)
                          /* We kill passed number of old locks. */
@@ -1472,18 +1520,23 @@ ldlm_cancel_lru_policy(struct ldlm_namespace *ns, int flags)
   *                              memory pressre policy function;
   *
   * flags & LDLM_CANCEL_AGED -   cancel alocks according to "aged policy".
+ *
+ * flags & LDLM_CANCEL_NO_WAIT - cancel as many unused locks as possible
+ *                               (typically before replaying locks) w/o
+ *                               sending any rpcs or waiting for any
+ *                               outstanding rpc to complete.
   */
-int ldlm_cancel_lru_local(struct ldlm_namespace *ns, cfs_list_t *cancels,
-                          int count, int max, ldlm_cancel_flags_t cancel_flags,
-                          int flags)
+static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, cfs_list_t *cancels,
+                                 int count, int max, int flags)
  {
          ldlm_cancel_lru_policy_t pf;
          struct ldlm_lock *lock, *next;
-        int added = 0, unused;
+        int added = 0, unused, remained;
          ENTRY;
  
-        cfs_spin_lock(&ns->ns_unused_lock);
+        cfs_spin_lock(&ns->ns_lock);
          unused = ns->ns_nr_unused;
+        remained = unused;
  
          if (!ns_connect_lru_resize(ns))
                  count += unused - ns->ns_max_unused;
@@ -1492,6 +1545,12 @@ int ldlm_cancel_lru_local(struct ldlm_namespace *ns, cfs_list_t *cancels,
          LASSERT(pf != NULL);
  
          while (!cfs_list_empty(&ns->ns_unused_list)) {
+                ldlm_policy_res_t result;
+
+                /* all unused locks */
+                if (remained-- <= 0)
+                        break;
+
                  /* For any flags, stop scanning if @max is reached. */
                  if (max && added >= max)
                          break;
@@ -1501,6 +1560,11 @@ int ldlm_cancel_lru_local(struct ldlm_namespace *ns, cfs_list_t *cancels,
                          /* No locks which got blocking requests. */
                          LASSERT(!(lock->l_flags & LDLM_FL_BL_AST));
  
+                        if (flags & LDLM_CANCEL_NO_WAIT &&
+                            lock->l_flags & LDLM_FL_SKIPPED)
+                                /* already processed */
+                                continue;
+
                          /* Somebody is already doing CANCEL. No need in this
                           * lock in lru, do not traverse it again. */
                          if (!(lock->l_flags & LDLM_FL_CANCELING))
@@ -1512,7 +1576,7 @@ int ldlm_cancel_lru_local(struct ldlm_namespace *ns, cfs_list_t *cancels,
                          break;
  
                  LDLM_LOCK_GET(lock);
-                cfs_spin_unlock(&ns->ns_unused_lock);
+                cfs_spin_unlock(&ns->ns_lock);
                  lu_ref_add(&lock->l_reference, __FUNCTION__, cfs_current());
  
                  /* Pass the lock through the policy filter and see if it
@@ -1528,14 +1592,21 @@ int ldlm_cancel_lru_local(struct ldlm_namespace *ns, cfs_list_t *cancels,
                   * old locks, but additionally chose them by
                   * their weight. Big extent locks will stay in
                   * the cache. */
-                if (pf(ns, lock, unused, added, count) ==
-                    LDLM_POLICY_KEEP_LOCK) {
+                result = pf(ns, lock, unused, added, count);
+                if (result == LDLM_POLICY_KEEP_LOCK) {
                          lu_ref_del(&lock->l_reference,
                                     __FUNCTION__, cfs_current());
                          LDLM_LOCK_RELEASE(lock);
-                        cfs_spin_lock(&ns->ns_unused_lock);
+                        cfs_spin_lock(&ns->ns_lock);
                          break;
                  }
+                if (result == LDLM_POLICY_SKIP_LOCK) {
+                        lu_ref_del(&lock->l_reference,
+                                   __FUNCTION__, cfs_current());
+                        LDLM_LOCK_RELEASE(lock);
+                        cfs_spin_lock(&ns->ns_lock);
+                        continue;
+                }
  
                  lock_res_and_lock(lock);
                  /* Check flags again under the lock. */
@@ -1550,7 +1621,7 @@ int ldlm_cancel_lru_local(struct ldlm_namespace *ns, cfs_list_t *cancels,
                          lu_ref_del(&lock->l_reference,
                                     __FUNCTION__, cfs_current());
                          LDLM_LOCK_RELEASE(lock);
-                        cfs_spin_lock(&ns->ns_unused_lock);
+                        cfs_spin_lock(&ns->ns_lock);
                          continue;
                  }
                  LASSERT(!lock->l_readers && !lock->l_writers);
@@ -1572,7 +1643,7 @@ int ldlm_cancel_lru_local(struct ldlm_namespace *ns, cfs_list_t *cancels,
  
                  /* We can't re-add to l_lru as it confuses the
                   * refcounting in ldlm_lock_remove_from_lru() if an AST
-                 * arrives after we drop ns_lock below. We use l_bl_ast
+                 * arrives after we drop lr_lock below. We use l_bl_ast
                   * and can't use l_pending_chain as it is used both on
                   * server and client nevertheless bug 5666 says it is
                   * used only on server */
@@ -1580,19 +1651,30 @@ int ldlm_cancel_lru_local(struct ldlm_namespace *ns, cfs_list_t *cancels,
                  cfs_list_add(&lock->l_bl_ast, cancels);
                  unlock_res_and_lock(lock);
                  lu_ref_del(&lock->l_reference, __FUNCTION__, cfs_current());
-                cfs_spin_lock(&ns->ns_unused_lock);
+                cfs_spin_lock(&ns->ns_lock);
                  added++;
                  unused--;
          }
-        cfs_spin_unlock(&ns->ns_unused_lock);
-        RETURN(ldlm_cancel_list(cancels, added, cancel_flags));
+        cfs_spin_unlock(&ns->ns_lock);
+        RETURN(added);
+}
+
+int ldlm_cancel_lru_local(struct ldlm_namespace *ns, cfs_list_t *cancels,
+                          int count, int max, ldlm_cancel_flags_t cancel_flags,
+                          int flags)
+{
+        int added;
+        added = ldlm_prepare_lru_list(ns, cancels, count, max, flags);
+        if (added <= 0)
+                return added;
+        return ldlm_cli_cancel_list_local(cancels, added, cancel_flags);
  }
  
  /* when called with LDLM_ASYNC the blocking callback will be handled
   * in a thread and this function will return after the thread has been
   * asked to call the callback.  when called with LDLM_SYNC the blocking
   * callback will be performed in this function. */
-int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, ldlm_sync_t sync,
+int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, ldlm_sync_t mode,
                      int flags)
  {
          CFS_LIST_HEAD(cancels);
@@ -1600,19 +1682,16 @@ int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, ldlm_sync_t sync,
          ENTRY;
  
  #ifndef __KERNEL__
-        sync = LDLM_SYNC; /* force to be sync in user space */
+        mode = LDLM_SYNC; /* force to be sync in user space */
  #endif
-        count = ldlm_cancel_lru_local(ns, &cancels, nr, 0, 0, flags);
-        if (sync == LDLM_ASYNC) {
-                rc = ldlm_bl_to_thread_list(ns, NULL, &cancels, count);
-                if (rc == 0)
-                        RETURN(count);
-        }
+        /* Just prepare the list of locks, do not actually cancel them yet.
+         * Locks are cancelled later in a separate thread. */
+        count = ldlm_prepare_lru_list(ns, &cancels, nr, 0, flags);
+        rc = ldlm_bl_to_thread_list(ns, NULL, &cancels, count, mode);
+        if (rc == 0)
+                RETURN(count);
  
-        /* If an error occured in ASYNC mode, or this is SYNC mode,
-         * cancel the list. */
-        ldlm_cli_cancel_list(&cancels, count, NULL, 0);
-        RETURN(count);
+        RETURN(0);
  }
  
  /* Find and cancel locally unused locks found on resource, matched to the
@@ -1667,7 +1746,7 @@ int ldlm_cancel_resource_local(struct ldlm_resource *res,
          }
          unlock_res(res);
  
-        RETURN(ldlm_cancel_list(cancels, count, cancel_flags));
+        RETURN(ldlm_cli_cancel_list_local(cancels, count, cancel_flags));
  }
  
  /* If @req is NULL, send CANCEL request to server with handles of locks
@@ -1744,7 +1823,7 @@ int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns,
  
          LDLM_RESOURCE_ADDREF(res);
          count = ldlm_cancel_resource_local(res, &cancels, policy, mode,
-                                           0, flags, opaque);
+                                           0, flags | LCF_BL_AST, opaque);
          rc = ldlm_cli_cancel_list(&cancels, count, NULL, flags);
          if (rc != ELDLM_OK)
                  CERROR("ldlm_cli_cancel_unused_resource: %d\n", rc);
@@ -1754,16 +1833,27 @@ int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns,
          RETURN(0);
  }
  
-static inline int have_no_nsresource(struct ldlm_namespace *ns)
-{
-        int no_resource = 0;
-
-        cfs_spin_lock(&ns->ns_hash_lock);
-        if (ns->ns_resources == 0)
-                no_resource = 1;
-        cfs_spin_unlock(&ns->ns_hash_lock);
+struct ldlm_cli_cancel_arg {
+        int     lc_flags;
+        void   *lc_opaque;
+};
  
-        RETURN(no_resource);
+static int ldlm_cli_hash_cancel_unused(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                                       cfs_hlist_node_t *hnode, void *arg)
+{
+        struct ldlm_resource           *res = cfs_hash_object(hs, hnode);
+        struct ldlm_cli_cancel_arg     *lc = arg;
+        int                             rc;
+
+        rc = ldlm_cli_cancel_unused_resource(ldlm_res_to_ns(res), &res->lr_name,
+                                             NULL, LCK_MINMODE,
+                                             lc->lc_flags, lc->lc_opaque);
+        if (rc != 0) {
+                CERROR("ldlm_cli_cancel_unused ("LPU64"): %d\n",
+                       res->lr_name.name[0], rc);
+        }
+        /* must return 0 for hash iteration */
+        return 0;
  }
  
  /* Cancel all locks on a namespace (or a specific resource, if given)
@@ -1775,48 +1865,25 @@ int ldlm_cli_cancel_unused(struct ldlm_namespace *ns,
                             const struct ldlm_res_id *res_id,
                             ldlm_cancel_flags_t flags, void *opaque)
  {
-        int i;
+        struct ldlm_cli_cancel_arg arg = {
+                .lc_flags       = flags,
+                .lc_opaque      = opaque,
+        };
+
          ENTRY;
  
          if (ns == NULL)
                  RETURN(ELDLM_OK);
  
-        if (res_id)
+        if (res_id != NULL) {
                  RETURN(ldlm_cli_cancel_unused_resource(ns, res_id, NULL,
                                                         LCK_MINMODE, flags,
                                                         opaque));
-
-        cfs_spin_lock(&ns->ns_hash_lock);
-        for (i = 0; i < RES_HASH_SIZE; i++) {
-                cfs_list_t *tmp;
-                tmp = ns->ns_hash[i].next;
-                while (tmp != &(ns->ns_hash[i])) {
-                        struct ldlm_resource *res;
-                        int rc;
-
-                        res = cfs_list_entry(tmp, struct ldlm_resource,
-                                             lr_hash);
-                        ldlm_resource_getref(res);
-                        cfs_spin_unlock(&ns->ns_hash_lock);
-
-                        LDLM_RESOURCE_ADDREF(res);
-                        rc = ldlm_cli_cancel_unused_resource(ns, &res->lr_name,
-                                                             NULL, LCK_MINMODE,
-                                                             flags, opaque);
-
-                        if (rc)
-                                CERROR("ldlm_cli_cancel_unused ("LPU64"): %d\n",
-                                       res->lr_name.name[0], rc);
-
-                        LDLM_RESOURCE_DELREF(res);
-                        cfs_spin_lock(&ns->ns_hash_lock);
-                        tmp = tmp->next;
-                        ldlm_resource_putref_locked(res);
-                }
+        } else {
+                cfs_hash_for_each_nolock(ns->ns_rs_hash,
+                                         ldlm_cli_hash_cancel_unused, &arg);
+                RETURN(ELDLM_OK);
          }
-        cfs_spin_unlock(&ns->ns_hash_lock);
-
-        RETURN(ELDLM_OK);
  }
  
  /* Lock iterators. */
@@ -1870,49 +1937,25 @@ static int ldlm_iter_helper(struct ldlm_lock *lock, void *closure)
          return helper->iter(lock, helper->closure);
  }
  
-static int ldlm_res_iter_helper(struct ldlm_resource *res, void *closure)
+static int ldlm_res_iter_helper(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                                cfs_hlist_node_t *hnode, void *arg)
+
  {
-        return ldlm_resource_foreach(res, ldlm_iter_helper, closure);
+        struct ldlm_resource *res = cfs_hash_object(hs, hnode);
+
+        return ldlm_resource_foreach(res, ldlm_iter_helper, arg) ==
+               LDLM_ITER_STOP;
  }
  
-int ldlm_namespace_foreach(struct ldlm_namespace *ns, ldlm_iterator_t iter,
-                           void *closure)
+void ldlm_namespace_foreach(struct ldlm_namespace *ns,
+                            ldlm_iterator_t iter, void *closure)
+
  {
          struct iter_helper_data helper = { iter: iter, closure: closure };
-        return ldlm_namespace_foreach_res(ns, ldlm_res_iter_helper, &helper);
-}
  
-int ldlm_namespace_foreach_res(struct ldlm_namespace *ns,
-                               ldlm_res_iterator_t iter, void *closure)
-{
-        int i, rc = LDLM_ITER_CONTINUE;
-        struct ldlm_resource *res;
-        cfs_list_t *tmp;
+        cfs_hash_for_each_nolock(ns->ns_rs_hash,
+                                 ldlm_res_iter_helper, &helper);
  
-        ENTRY;
-        cfs_spin_lock(&ns->ns_hash_lock);
-        for (i = 0; i < RES_HASH_SIZE; i++) {
-                tmp = ns->ns_hash[i].next;
-                while (tmp != &(ns->ns_hash[i])) {
-                        res = cfs_list_entry(tmp, struct ldlm_resource,
-                                             lr_hash);
-                        ldlm_resource_getref(res);
-                        cfs_spin_unlock(&ns->ns_hash_lock);
-                        LDLM_RESOURCE_ADDREF(res);
-
-                        rc = iter(res, closure);
-
-                        LDLM_RESOURCE_DELREF(res);
-                        cfs_spin_lock(&ns->ns_hash_lock);
-                        tmp = tmp->next;
-                        ldlm_resource_putref_locked(res);
-                        if (rc == LDLM_ITER_STOP)
-                                GOTO(out, rc);
-                }
-        }
- out:
-        cfs_spin_unlock(&ns->ns_hash_lock);
-        RETURN(rc);
  }
  
  /* non-blocking function to manipulate a lock whose cb_data is being put away.
@@ -1970,7 +2013,6 @@ static int replay_lock_interpret(const struct lu_env *env,
                                   struct ptlrpc_request *req,
                                   struct ldlm_async_args *aa, int rc)
  {
-        struct lustre_handle  old_hash_key;
          struct ldlm_lock     *lock;
          struct ldlm_reply    *reply;
          struct obd_export    *exp;
@@ -1995,15 +2037,16 @@ static int replay_lock_interpret(const struct lu_env *env,
                  GOTO(out, rc = -ESTALE);
          }
  
-        old_hash_key = lock->l_remote_handle;
-        lock->l_remote_handle = reply->lock_handle;
-
          /* Key change rehash lock in per-export hash with new key */
          exp = req->rq_export;
-        if (exp && exp->exp_lock_hash)
-                cfs_hash_rehash_key(exp->exp_lock_hash, &old_hash_key,
+        if (exp && exp->exp_lock_hash) {
+                cfs_hash_rehash_key(exp->exp_lock_hash,
                                      &lock->l_remote_handle,
+                                    &reply->lock_handle,
                                      &lock->l_exp_hash);
+        } else {
+                lock->l_remote_handle = reply->lock_handle;
+        }
  
          LDLM_DEBUG(lock, "replayed lock:");
          ptlrpc_import_recovery_state_machine(req->rq_import);
@@ -2098,6 +2141,35 @@ static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock)
          RETURN(0);
  }
  
+/**
+ * Cancel as many unused locks as possible before replay. since we are
+ * in recovery, we can't wait for any outstanding RPCs or send any RPC
+ * to the server.
+ *
+ * Called only in recovery before replaying locks. there is no need to
+ * replay locks that are unused. since the clients may hold thousands of
+ * cached unused locks, dropping the unused locks can greatly reduce the
+ * load on the servers at recovery time.
+ */
+static void ldlm_cancel_unused_locks_for_replay(struct ldlm_namespace *ns)
+{
+        int canceled;
+        CFS_LIST_HEAD(cancels);
+
+        CDEBUG(D_DLMTRACE, "Dropping as many unused locks as possible before"
+                           " replay for namespace %s (%d)\n",
+                           ldlm_ns_name(ns), ns->ns_nr_unused);
+
+        /* We don't need to care whether or not LRU resize is enabled
+         * because the LDLM_CANCEL_NO_WAIT policy doesn't use the
+         * count parameter */
+        canceled = ldlm_cancel_lru_local(ns, &cancels, ns->ns_nr_unused, 0,
+                                         LCF_LOCAL, LDLM_CANCEL_NO_WAIT);
+
+        CDEBUG(D_DLMTRACE, "Canceled %d unused locks from namespace %s\n",
+                           canceled, ldlm_ns_name(ns));
+}
+
  int ldlm_replay_locks(struct obd_import *imp)
  {
          struct ldlm_namespace *ns = imp->imp_obd->obd_namespace;
@@ -2116,16 +2188,19 @@ int ldlm_replay_locks(struct obd_import *imp)
          /* ensure this doesn't fall to 0 before all have been queued */
          cfs_atomic_inc(&imp->imp_replay_inflight);
  
-        (void)ldlm_namespace_foreach(ns, ldlm_chain_lock_for_replay, &list);
+        if (ldlm_cancel_unused_locks_before_replay)
+                ldlm_cancel_unused_locks_for_replay(ns);
+
+        ldlm_namespace_foreach(ns, ldlm_chain_lock_for_replay, &list);
  
          cfs_list_for_each_entry_safe(lock, next, &list, l_pending_chain) {
                  cfs_list_del_init(&lock->l_pending_chain);
                  if (rc) {
-                        LDLM_LOCK_PUT(lock);
+                        LDLM_LOCK_RELEASE(lock);
                          continue; /* or try to do the rest? */
                  }
                  rc = replay_one_lock(imp, lock);
-                LDLM_LOCK_PUT(lock);
+                LDLM_LOCK_RELEASE(lock);
          }
  
          cfs_atomic_dec(&imp->imp_replay_inflight);