Whamcloud - gitweb
b=24375 Fix lvb updating race in enqueue vs completion case
[fs/lustre-release.git] / lustre / ldlm / ldlm_request.c
index f287f1a..5d718c9 100644 (file)
@@ -26,7 +26,7 @@
  * GPL HEADER END
  */
 /*
- * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  * Use is subject to license terms.
  */
 /*
@@ -50,6 +50,9 @@ int ldlm_enqueue_min = OBD_TIMEOUT_DEFAULT;
 CFS_MODULE_PARM(ldlm_enqueue_min, "i", int, 0644,
                 "lock enqueue timeout minimum");
 
+/* On the client side: whether cached locks are canceled before replay */
+unsigned int ldlm_cancel_unused_locks_before_replay = 1;
+
 static void interrupted_completion_wait(void *data)
 {
 }
@@ -87,7 +90,7 @@ int ldlm_expired_completion_wait(void *data)
                         last_dump = next_dump;
                         next_dump = cfs_time_shift(300);
                         ldlm_namespace_dump(D_DLMTRACE,
-                                            lock->l_resource->lr_namespace);
+                                            ldlm_lock_to_ns(lock));
                         if (last_dump == 0)
                                 libcfs_debug_dumplog();
                 }
@@ -110,7 +113,7 @@ int ldlm_expired_completion_wait(void *data)
    from a single node. */
 int ldlm_get_enq_timeout(struct ldlm_lock *lock)
 {
-        int timeout = at_get(&lock->l_resource->lr_namespace->ns_at_estimate);
+        int timeout = at_get(ldlm_lock_to_ns_at(lock));
         if (AT_OFF)
                 return obd_timeout / 2;
         /* Since these are non-updating timeouts, we should be conservative.
@@ -140,7 +143,7 @@ static int ldlm_completion_tail(struct ldlm_lock *lock)
                            CFS_DURATION_T"s", delay);
 
                 /* Update our time estimate */
-                at_measured(&lock->l_resource->lr_namespace->ns_at_estimate,
+                at_measured(ldlm_lock_to_ns_at(lock),
                             delay);
                 result = 0;
         }
@@ -252,7 +255,7 @@ noreproc:
                 cfs_spin_unlock(&imp->imp_lock);
         }
 
-        if (ns_is_client(lock->l_resource->lr_namespace) &&
+        if (ns_is_client(ldlm_lock_to_ns(lock)) &&
             OBD_FAIL_CHECK_RESET(OBD_FAIL_LDLM_INTR_CP_AST,
                                  OBD_FAIL_LDLM_CP_BL_RACE | OBD_FAIL_ONCE)) {
                 lock->l_flags |= LDLM_FL_FAIL_LOC;
@@ -334,7 +337,7 @@ int ldlm_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
         lock_res_and_lock(lock);
         /* Get this: if ldlm_blocking_ast is racing with intent_policy, such
          * that ldlm_blocking_ast is called just before intent_policy method
-         * takes the ns_lock, then by the time we get the lock, we might not
+         * takes the lr_lock, then by the time we get the lock, we might not
          * be the correct blocking function anymore.  So check, and return
          * early, if so. */
         if (lock->l_blocking_ast != ldlm_blocking_ast) {
@@ -400,13 +403,15 @@ int ldlm_cli_enqueue_local(struct ldlm_namespace *ns,
         if (unlikely(!lock))
                 GOTO(out_nolock, err = -ENOMEM);
 
-        ldlm_lock_addref_internal(lock, mode);
         ldlm_lock2handle(lock, lockh);
-        lock_res_and_lock(lock);
+
+        /* NB: we don't have any lock now (lock_res_and_lock)
+         * because it's a new lock */
+        ldlm_lock_addref_internal_nolock(lock, mode);
         lock->l_flags |= LDLM_FL_LOCAL;
         if (*flags & LDLM_FL_ATOMIC_CB)
                 lock->l_flags |= LDLM_FL_ATOMIC_CB;
-        unlock_res_and_lock(lock);
+
         if (policy != NULL)
                 lock->l_policy_data = *policy;
         if (client_cookie != NULL)
@@ -481,9 +486,9 @@ int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req,
 {
         struct ldlm_namespace *ns = exp->exp_obd->obd_namespace;
         int is_replay = *flags & LDLM_FL_REPLAY;
-        struct lustre_handle old_hash_key;
         struct ldlm_lock *lock;
         struct ldlm_reply *reply;
+        struct ost_lvb *tmplvb;
         int cleanup_phase = 1;
         ENTRY;
 
@@ -505,12 +510,11 @@ int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req,
                         if (reply == NULL)
                                 rc = -EPROTO;
                         if (lvb_len) {
-                                struct ost_lvb *tmplvb;
 
                                 req_capsule_set_size(&req->rq_pill,
                                                      &RMF_DLM_LVB, RCL_SERVER,
                                                      lvb_len);
-                            tmplvb = req_capsule_server_get(&req->rq_pill,
+                                tmplvb = req_capsule_server_get(&req->rq_pill,
                                                                  &RMF_DLM_LVB);
                                 if (tmplvb == NULL)
                                         GOTO(cleanup, rc = -EPROTO);
@@ -529,14 +533,15 @@ int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req,
         cleanup_phase = 0;
 
         lock_res_and_lock(lock);
-        old_hash_key = lock->l_remote_handle;
-        lock->l_remote_handle = reply->lock_handle;
-
         /* Key change rehash lock in per-export hash with new key */
-        if (exp->exp_lock_hash)
-                cfs_hash_rehash_key(exp->exp_lock_hash, &old_hash_key,
+        if (exp->exp_lock_hash) {
+                cfs_hash_rehash_key(exp->exp_lock_hash,
                                     &lock->l_remote_handle,
+                                    &reply->lock_handle,
                                     &lock->l_exp_hash);
+        } else {
+                lock->l_remote_handle = reply->lock_handle;
+        }
 
         *flags = reply->lock_flags;
         lock->l_flags |= reply->lock_flags & LDLM_INHERIT_FLAGS;
@@ -601,16 +606,25 @@ int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req,
 
         /* If the lock has already been granted by a completion AST, don't
          * clobber the LVB with an older one. */
-        if (lvb_len && (lock->l_req_mode != lock->l_granted_mode)) {
-                void *tmplvb;
+        if (lvb_len) {
+                /* We must lock or a racing completion might update lvb
+                   without letting us know and we'll clobber the correct value.
+                   Cannot unlock after the check either, as that still leaves
+                   a tiny window for completion to get in */
+                lock_res_and_lock(lock);
+                if (lock->l_req_mode != lock->l_granted_mode) {
 
-                req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
-                                     lvb_len);
-                tmplvb = req_capsule_server_get(&req->rq_pill,
-                                                     &RMF_DLM_LVB);
-                if (tmplvb == NULL)
-                        GOTO(cleanup, rc = -EPROTO);
-                memcpy(lock->l_lvb_data, tmplvb, lvb_len);
+                        req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB,
+                                             RCL_SERVER, lvb_len);
+                        tmplvb = req_capsule_server_get(&req->rq_pill,
+                                                             &RMF_DLM_LVB);
+                        if (tmplvb == NULL) {
+                                unlock_res_and_lock(lock);
+                                GOTO(cleanup, rc = -EPROTO);
+                        }
+                        memcpy(lock->l_lvb_data, tmplvb, lvb_len);
+                }
+                unlock_res_and_lock(lock);
         }
 
         if (!is_replay) {
@@ -902,7 +916,7 @@ static int ldlm_cli_convert_local(struct ldlm_lock *lock, int new_mode,
         struct ldlm_resource *res;
         int rc;
         ENTRY;
-        if (ns_is_client(lock->l_resource->lr_namespace)) {
+        if (ns_is_client(ldlm_lock_to_ns(lock))) {
                 CERROR("Trying to cancel local lock\n");
                 LBUG();
         }
@@ -1025,7 +1039,7 @@ static int ldlm_cli_cancel_local(struct ldlm_lock *lock)
                 }
                 ldlm_lock_cancel(lock);
         } else {
-                if (ns_is_client(lock->l_resource->lr_namespace)) {
+                if (ns_is_client(ldlm_lock_to_ns(lock))) {
                         LDLM_ERROR(lock, "Trying to cancel local lock");
                         LBUG();
                 }
@@ -1257,7 +1271,7 @@ int ldlm_cli_cancel(struct lustre_handle *lockh)
                                                   RCL_CLIENT, 0);
                 LASSERT(avail > 0);
 
-                ns = lock->l_resource->lr_namespace;
+                ns = ldlm_lock_to_ns(lock);
                 flags = ns_connect_lru_resize(ns) ?
                         LDLM_CANCEL_LRUR : LDLM_CANCEL_AGED;
                 count += ldlm_cancel_lru_local(ns, &cancels, 0, avail - 1,
@@ -1269,8 +1283,8 @@ int ldlm_cli_cancel(struct lustre_handle *lockh)
 
 /* XXX until we will have compound requests and can cut cancels from generic rpc
  * we need send cancels with LDLM_FL_BL_AST flag as separate rpc */
-static int ldlm_cancel_list(cfs_list_t *cancels, int count,
-                            ldlm_cancel_flags_t flags)
+int ldlm_cli_cancel_list_local(cfs_list_t *cancels, int count,
+                               ldlm_cancel_flags_t flags)
 {
         CFS_LIST_HEAD(head);
         struct ldlm_lock *lock, *next;
@@ -1311,6 +1325,37 @@ static int ldlm_cancel_list(cfs_list_t *cancels, int count,
 }
 
 /**
+ * Cancel as many locks as possible w/o sending any rpcs (e.g. to write back
+ * dirty data, to close a file, ...) or waiting for any rpcs in-flight (e.g.
+ * readahead requests, ...)
+ */
+static ldlm_policy_res_t ldlm_cancel_no_wait_policy(struct ldlm_namespace *ns,
+                                                    struct ldlm_lock *lock,
+                                                    int unused, int added,
+                                                    int count)
+{
+        ldlm_policy_res_t result = LDLM_POLICY_CANCEL_LOCK;
+        ldlm_cancel_for_recovery cb = ns->ns_cancel_for_recovery;
+        lock_res_and_lock(lock);
+
+        /* don't check added & count since we want to process all locks
+         * from unused list */
+        switch (lock->l_resource->lr_type) {
+                case LDLM_EXTENT:
+                case LDLM_IBITS:
+                        if (cb && cb(lock))
+                                break;
+                default:
+                        result = LDLM_POLICY_SKIP_LOCK;
+                        lock->l_flags |= LDLM_FL_SKIPPED;
+                        break;
+        }
+
+        unlock_res_and_lock(lock);
+        RETURN(result);
+}
+
+/**
  * Callback function for lru-resize policy. Makes decision whether to keep
  * \a lock in LRU for current \a LRU size \a unused, added in current scan
  * \a added and number of locks to be preferably canceled \a count.
@@ -1431,6 +1476,9 @@ typedef ldlm_policy_res_t (*ldlm_cancel_lru_policy_t)(struct ldlm_namespace *,
 static ldlm_cancel_lru_policy_t
 ldlm_cancel_lru_policy(struct ldlm_namespace *ns, int flags)
 {
+        if (flags & LDLM_CANCEL_NO_WAIT)
+                return ldlm_cancel_no_wait_policy;
+
         if (ns_connect_lru_resize(ns)) {
                 if (flags & LDLM_CANCEL_SHRINK)
                         /* We kill passed number of old locks. */
@@ -1472,18 +1520,23 @@ ldlm_cancel_lru_policy(struct ldlm_namespace *ns, int flags)
  *                              memory pressre policy function;
  *
  * flags & LDLM_CANCEL_AGED -   cancel alocks according to "aged policy".
+ *
+ * flags & LDLM_CANCEL_NO_WAIT - cancel as many unused locks as possible
+ *                               (typically before replaying locks) w/o
+ *                               sending any rpcs or waiting for any
+ *                               outstanding rpc to complete.
  */
-int ldlm_cancel_lru_local(struct ldlm_namespace *ns, cfs_list_t *cancels,
-                          int count, int max, ldlm_cancel_flags_t cancel_flags,
-                          int flags)
+static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, cfs_list_t *cancels,
+                                 int count, int max, int flags)
 {
         ldlm_cancel_lru_policy_t pf;
         struct ldlm_lock *lock, *next;
-        int added = 0, unused;
+        int added = 0, unused, remained;
         ENTRY;
 
-        cfs_spin_lock(&ns->ns_unused_lock);
+        cfs_spin_lock(&ns->ns_lock);
         unused = ns->ns_nr_unused;
+        remained = unused;
 
         if (!ns_connect_lru_resize(ns))
                 count += unused - ns->ns_max_unused;
@@ -1492,6 +1545,12 @@ int ldlm_cancel_lru_local(struct ldlm_namespace *ns, cfs_list_t *cancels,
         LASSERT(pf != NULL);
 
         while (!cfs_list_empty(&ns->ns_unused_list)) {
+                ldlm_policy_res_t result;
+
+                /* all unused locks */
+                if (remained-- <= 0)
+                        break;
+
                 /* For any flags, stop scanning if @max is reached. */
                 if (max && added >= max)
                         break;
@@ -1501,6 +1560,11 @@ int ldlm_cancel_lru_local(struct ldlm_namespace *ns, cfs_list_t *cancels,
                         /* No locks which got blocking requests. */
                         LASSERT(!(lock->l_flags & LDLM_FL_BL_AST));
 
+                        if (flags & LDLM_CANCEL_NO_WAIT &&
+                            lock->l_flags & LDLM_FL_SKIPPED)
+                                /* already processed */
+                                continue;
+
                         /* Somebody is already doing CANCEL. No need in this
                          * lock in lru, do not traverse it again. */
                         if (!(lock->l_flags & LDLM_FL_CANCELING))
@@ -1512,7 +1576,7 @@ int ldlm_cancel_lru_local(struct ldlm_namespace *ns, cfs_list_t *cancels,
                         break;
 
                 LDLM_LOCK_GET(lock);
-                cfs_spin_unlock(&ns->ns_unused_lock);
+                cfs_spin_unlock(&ns->ns_lock);
                 lu_ref_add(&lock->l_reference, __FUNCTION__, cfs_current());
 
                 /* Pass the lock through the policy filter and see if it
@@ -1528,14 +1592,21 @@ int ldlm_cancel_lru_local(struct ldlm_namespace *ns, cfs_list_t *cancels,
                  * old locks, but additionally chose them by
                  * their weight. Big extent locks will stay in
                  * the cache. */
-                if (pf(ns, lock, unused, added, count) ==
-                    LDLM_POLICY_KEEP_LOCK) {
+                result = pf(ns, lock, unused, added, count);
+                if (result == LDLM_POLICY_KEEP_LOCK) {
                         lu_ref_del(&lock->l_reference,
                                    __FUNCTION__, cfs_current());
                         LDLM_LOCK_RELEASE(lock);
-                        cfs_spin_lock(&ns->ns_unused_lock);
+                        cfs_spin_lock(&ns->ns_lock);
                         break;
                 }
+                if (result == LDLM_POLICY_SKIP_LOCK) {
+                        lu_ref_del(&lock->l_reference,
+                                   __FUNCTION__, cfs_current());
+                        LDLM_LOCK_RELEASE(lock);
+                        cfs_spin_lock(&ns->ns_lock);
+                        continue;
+                }
 
                 lock_res_and_lock(lock);
                 /* Check flags again under the lock. */
@@ -1550,7 +1621,7 @@ int ldlm_cancel_lru_local(struct ldlm_namespace *ns, cfs_list_t *cancels,
                         lu_ref_del(&lock->l_reference,
                                    __FUNCTION__, cfs_current());
                         LDLM_LOCK_RELEASE(lock);
-                        cfs_spin_lock(&ns->ns_unused_lock);
+                        cfs_spin_lock(&ns->ns_lock);
                         continue;
                 }
                 LASSERT(!lock->l_readers && !lock->l_writers);
@@ -1572,7 +1643,7 @@ int ldlm_cancel_lru_local(struct ldlm_namespace *ns, cfs_list_t *cancels,
 
                 /* We can't re-add to l_lru as it confuses the
                  * refcounting in ldlm_lock_remove_from_lru() if an AST
-                 * arrives after we drop ns_lock below. We use l_bl_ast
+                 * arrives after we drop lr_lock below. We use l_bl_ast
                  * and can't use l_pending_chain as it is used both on
                  * server and client nevertheless bug 5666 says it is
                  * used only on server */
@@ -1580,19 +1651,30 @@ int ldlm_cancel_lru_local(struct ldlm_namespace *ns, cfs_list_t *cancels,
                 cfs_list_add(&lock->l_bl_ast, cancels);
                 unlock_res_and_lock(lock);
                 lu_ref_del(&lock->l_reference, __FUNCTION__, cfs_current());
-                cfs_spin_lock(&ns->ns_unused_lock);
+                cfs_spin_lock(&ns->ns_lock);
                 added++;
                 unused--;
         }
-        cfs_spin_unlock(&ns->ns_unused_lock);
-        RETURN(ldlm_cancel_list(cancels, added, cancel_flags));
+        cfs_spin_unlock(&ns->ns_lock);
+        RETURN(added);
+}
+
+int ldlm_cancel_lru_local(struct ldlm_namespace *ns, cfs_list_t *cancels,
+                          int count, int max, ldlm_cancel_flags_t cancel_flags,
+                          int flags)
+{
+        int added; /* number of locks ldlm_prepare_lru_list() put on @cancels */
+        added = ldlm_prepare_lru_list(ns, cancels, count, max, flags);
+        if (added <= 0)
+                return added; /* nothing collected: skip the local cancel pass */
+        return ldlm_cli_cancel_list_local(cancels, added, cancel_flags);
 }
 
 /* when called with LDLM_ASYNC the blocking callback will be handled
  * in a thread and this function will return after the thread has been
  * asked to call the callback.  when called with LDLM_SYNC the blocking
  * callback will be performed in this function. */
-int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, ldlm_sync_t sync,
+int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, ldlm_sync_t mode,
                     int flags)
 {
         CFS_LIST_HEAD(cancels);
@@ -1600,19 +1682,16 @@ int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, ldlm_sync_t sync,
         ENTRY;
 
 #ifndef __KERNEL__
-        sync = LDLM_SYNC; /* force to be sync in user space */
+        mode = LDLM_SYNC; /* force to be sync in user space */
 #endif
-        count = ldlm_cancel_lru_local(ns, &cancels, nr, 0, 0, flags);
-        if (sync == LDLM_ASYNC) {
-                rc = ldlm_bl_to_thread_list(ns, NULL, &cancels, count);
-                if (rc == 0)
-                        RETURN(count);
-        }
+        /* Just prepare the list of locks, do not actually cancel them yet.
+         * Locks are cancelled later in a separate thread. */
+        count = ldlm_prepare_lru_list(ns, &cancels, nr, 0, flags);
+        rc = ldlm_bl_to_thread_list(ns, NULL, &cancels, count, mode);
+        if (rc == 0)
+                RETURN(count);
 
-        /* If an error occured in ASYNC mode, or this is SYNC mode,
-         * cancel the list. */
-        ldlm_cli_cancel_list(&cancels, count, NULL, 0);
-        RETURN(count);
+        RETURN(0);
 }
 
 /* Find and cancel locally unused locks found on resource, matched to the
@@ -1667,7 +1746,7 @@ int ldlm_cancel_resource_local(struct ldlm_resource *res,
         }
         unlock_res(res);
 
-        RETURN(ldlm_cancel_list(cancels, count, cancel_flags));
+        RETURN(ldlm_cli_cancel_list_local(cancels, count, cancel_flags));
 }
 
 /* If @req is NULL, send CANCEL request to server with handles of locks
@@ -1744,7 +1823,7 @@ int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns,
 
         LDLM_RESOURCE_ADDREF(res);
         count = ldlm_cancel_resource_local(res, &cancels, policy, mode,
-                                           0, flags, opaque);
+                                           0, flags | LCF_BL_AST, opaque);
         rc = ldlm_cli_cancel_list(&cancels, count, NULL, flags);
         if (rc != ELDLM_OK)
                 CERROR("ldlm_cli_cancel_unused_resource: %d\n", rc);
@@ -1754,16 +1833,27 @@ int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns,
         RETURN(0);
 }
 
-static inline int have_no_nsresource(struct ldlm_namespace *ns)
-{
-        int no_resource = 0;
-
-        cfs_spin_lock(&ns->ns_hash_lock);
-        if (ns->ns_resources == 0)
-                no_resource = 1;
-        cfs_spin_unlock(&ns->ns_hash_lock);
+struct ldlm_cli_cancel_arg {
+        int     lc_flags;       /* cancel flags handed to each resource */
+        void   *lc_opaque;      /* caller's opaque pointer, passed through */
+};
 
-        RETURN(no_resource);
+static int ldlm_cli_hash_cancel_unused(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                                       cfs_hlist_node_t *hnode, void *arg)
+{
+        struct ldlm_resource           *res = cfs_hash_object(hs, hnode);
+        struct ldlm_cli_cancel_arg     *lc = arg; /* flags + opaque from caller */
+        int                             rc;
+
+        rc = ldlm_cli_cancel_unused_resource(ldlm_res_to_ns(res), &res->lr_name,
+                                             NULL, LCK_MINMODE,
+                                             lc->lc_flags, lc->lc_opaque);
+        if (rc != 0) {
+                CERROR("ldlm_cli_cancel_unused ("LPU64"): %d\n",
+                       res->lr_name.name[0], rc);
+        }
+        /* errors are only logged: must return 0 for hash iteration */
+        return 0;
 }
 
 /* Cancel all locks on a namespace (or a specific resource, if given)
@@ -1775,48 +1865,25 @@ int ldlm_cli_cancel_unused(struct ldlm_namespace *ns,
                            const struct ldlm_res_id *res_id,
                            ldlm_cancel_flags_t flags, void *opaque)
 {
-        int i;
+        struct ldlm_cli_cancel_arg arg = {
+                .lc_flags       = flags,
+                .lc_opaque      = opaque,
+        };
+
         ENTRY;
 
         if (ns == NULL)
                 RETURN(ELDLM_OK);
 
-        if (res_id)
+        if (res_id != NULL) {
                 RETURN(ldlm_cli_cancel_unused_resource(ns, res_id, NULL,
                                                        LCK_MINMODE, flags,
                                                        opaque));
-
-        cfs_spin_lock(&ns->ns_hash_lock);
-        for (i = 0; i < RES_HASH_SIZE; i++) {
-                cfs_list_t *tmp;
-                tmp = ns->ns_hash[i].next;
-                while (tmp != &(ns->ns_hash[i])) {
-                        struct ldlm_resource *res;
-                        int rc;
-
-                        res = cfs_list_entry(tmp, struct ldlm_resource,
-                                             lr_hash);
-                        ldlm_resource_getref(res);
-                        cfs_spin_unlock(&ns->ns_hash_lock);
-
-                        LDLM_RESOURCE_ADDREF(res);
-                        rc = ldlm_cli_cancel_unused_resource(ns, &res->lr_name,
-                                                             NULL, LCK_MINMODE,
-                                                             flags, opaque);
-
-                        if (rc)
-                                CERROR("ldlm_cli_cancel_unused ("LPU64"): %d\n",
-                                       res->lr_name.name[0], rc);
-
-                        LDLM_RESOURCE_DELREF(res);
-                        cfs_spin_lock(&ns->ns_hash_lock);
-                        tmp = tmp->next;
-                        ldlm_resource_putref_locked(res);
-                }
+        } else {
+                cfs_hash_for_each_nolock(ns->ns_rs_hash,
+                                         ldlm_cli_hash_cancel_unused, &arg);
+                RETURN(ELDLM_OK);
         }
-        cfs_spin_unlock(&ns->ns_hash_lock);
-
-        RETURN(ELDLM_OK);
 }
 
 /* Lock iterators. */
@@ -1870,49 +1937,25 @@ static int ldlm_iter_helper(struct ldlm_lock *lock, void *closure)
         return helper->iter(lock, helper->closure);
 }
 
-static int ldlm_res_iter_helper(struct ldlm_resource *res, void *closure)
+static int ldlm_res_iter_helper(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+                                cfs_hlist_node_t *hnode, void *arg)
+
 {
-        return ldlm_resource_foreach(res, ldlm_iter_helper, closure);
+        struct ldlm_resource *res = cfs_hash_object(hs, hnode);
+
+        return ldlm_resource_foreach(res, ldlm_iter_helper, arg) ==
+               LDLM_ITER_STOP;
 }
 
-int ldlm_namespace_foreach(struct ldlm_namespace *ns, ldlm_iterator_t iter,
-                           void *closure)
+void ldlm_namespace_foreach(struct ldlm_namespace *ns,
+                            ldlm_iterator_t iter, void *closure)
+
 {
         struct iter_helper_data helper = { iter: iter, closure: closure };
-        return ldlm_namespace_foreach_res(ns, ldlm_res_iter_helper, &helper);
-}
 
-int ldlm_namespace_foreach_res(struct ldlm_namespace *ns,
-                               ldlm_res_iterator_t iter, void *closure)
-{
-        int i, rc = LDLM_ITER_CONTINUE;
-        struct ldlm_resource *res;
-        cfs_list_t *tmp;
+        cfs_hash_for_each_nolock(ns->ns_rs_hash,
+                                 ldlm_res_iter_helper, &helper);
 
-        ENTRY;
-        cfs_spin_lock(&ns->ns_hash_lock);
-        for (i = 0; i < RES_HASH_SIZE; i++) {
-                tmp = ns->ns_hash[i].next;
-                while (tmp != &(ns->ns_hash[i])) {
-                        res = cfs_list_entry(tmp, struct ldlm_resource,
-                                             lr_hash);
-                        ldlm_resource_getref(res);
-                        cfs_spin_unlock(&ns->ns_hash_lock);
-                        LDLM_RESOURCE_ADDREF(res);
-
-                        rc = iter(res, closure);
-
-                        LDLM_RESOURCE_DELREF(res);
-                        cfs_spin_lock(&ns->ns_hash_lock);
-                        tmp = tmp->next;
-                        ldlm_resource_putref_locked(res);
-                        if (rc == LDLM_ITER_STOP)
-                                GOTO(out, rc);
-                }
-        }
- out:
-        cfs_spin_unlock(&ns->ns_hash_lock);
-        RETURN(rc);
 }
 
 /* non-blocking function to manipulate a lock whose cb_data is being put away.
@@ -1970,7 +2013,6 @@ static int replay_lock_interpret(const struct lu_env *env,
                                  struct ptlrpc_request *req,
                                  struct ldlm_async_args *aa, int rc)
 {
-        struct lustre_handle  old_hash_key;
         struct ldlm_lock     *lock;
         struct ldlm_reply    *reply;
         struct obd_export    *exp;
@@ -1995,15 +2037,16 @@ static int replay_lock_interpret(const struct lu_env *env,
                 GOTO(out, rc = -ESTALE);
         }
 
-        old_hash_key = lock->l_remote_handle;
-        lock->l_remote_handle = reply->lock_handle;
-
         /* Key change rehash lock in per-export hash with new key */
         exp = req->rq_export;
-        if (exp && exp->exp_lock_hash)
-                cfs_hash_rehash_key(exp->exp_lock_hash, &old_hash_key,
+        if (exp && exp->exp_lock_hash) {
+                cfs_hash_rehash_key(exp->exp_lock_hash,
                                     &lock->l_remote_handle,
+                                    &reply->lock_handle,
                                     &lock->l_exp_hash);
+        } else {
+                lock->l_remote_handle = reply->lock_handle;
+        }
 
         LDLM_DEBUG(lock, "replayed lock:");
         ptlrpc_import_recovery_state_machine(req->rq_import);
@@ -2098,6 +2141,35 @@ static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock)
         RETURN(0);
 }
 
+/**
+ * Cancel as many unused locks as possible before replay. Since we are
+ * in recovery, we can't wait for any outstanding RPCs or send any RPC
+ * to the server.
+ *
+ * Called only in recovery before replaying locks. There is no need to
+ * replay locks that are unused. Since the clients may hold thousands of
+ * cached unused locks, dropping the unused locks can greatly reduce the
+ * load on the servers at recovery time.
+ */
+static void ldlm_cancel_unused_locks_for_replay(struct ldlm_namespace *ns)
+{
+        int canceled;
+        CFS_LIST_HEAD(cancels);
+
+        CDEBUG(D_DLMTRACE, "Dropping as many unused locks as possible before"
+                           " replay for namespace %s (%d)\n",
+                           ldlm_ns_name(ns), ns->ns_nr_unused);
+
+        /* We don't need to care whether or not LRU resize is enabled
+         * because the LDLM_CANCEL_NO_WAIT policy doesn't use the
+         * count parameter */
+        canceled = ldlm_cancel_lru_local(ns, &cancels, ns->ns_nr_unused, 0,
+                                         LCF_LOCAL, LDLM_CANCEL_NO_WAIT);
+
+        CDEBUG(D_DLMTRACE, "Canceled %d unused locks from namespace %s\n",
+                           canceled, ldlm_ns_name(ns));
+}
+
 int ldlm_replay_locks(struct obd_import *imp)
 {
         struct ldlm_namespace *ns = imp->imp_obd->obd_namespace;
@@ -2116,16 +2188,19 @@ int ldlm_replay_locks(struct obd_import *imp)
         /* ensure this doesn't fall to 0 before all have been queued */
         cfs_atomic_inc(&imp->imp_replay_inflight);
 
-        (void)ldlm_namespace_foreach(ns, ldlm_chain_lock_for_replay, &list);
+        if (ldlm_cancel_unused_locks_before_replay)
+                ldlm_cancel_unused_locks_for_replay(ns);
+
+        ldlm_namespace_foreach(ns, ldlm_chain_lock_for_replay, &list);
 
         cfs_list_for_each_entry_safe(lock, next, &list, l_pending_chain) {
                 cfs_list_del_init(&lock->l_pending_chain);
                 if (rc) {
-                        LDLM_LOCK_PUT(lock);
+                        LDLM_LOCK_RELEASE(lock);
                         continue; /* or try to do the rest? */
                 }
                 rc = replay_one_lock(imp, lock);
-                LDLM_LOCK_PUT(lock);
+                LDLM_LOCK_RELEASE(lock);
         }
 
         cfs_atomic_dec(&imp->imp_replay_inflight);