Whamcloud - gitweb
Revert "b=19808 2.6.29-fc11 patchless client support"
[fs/lustre-release.git] / lustre / obdclass / cl_lock.c
index 9685ac1..ce219a2 100644 (file)
@@ -127,6 +127,23 @@ static struct cl_thread_counters *cl_lock_counters(const struct lu_env *env,
         return &info->clt_counters[nesting];
 }
 
+static void cl_lock_trace0(int level, const struct lu_env *env,
+                           const char *prefix, const struct cl_lock *lock,
+                           const char *func, const int line)
+{       /* one CDEBUG record per call: lock fields, env info, call site */
+        struct cl_object_header *h = cl_object_header(lock->cll_descr.cld_obj);
+        CDEBUG(level, "%s: %p@(%i %p %i %d %d %d %d %lx)"
+                      "(%p/%d/%i) at %s():%d\n",
+               prefix, lock,
+               atomic_read(&lock->cll_ref), lock->cll_guarder, lock->cll_depth,
+               lock->cll_state, lock->cll_error, lock->cll_holds,
+               lock->cll_users, lock->cll_flags,
+               env, h->coh_nesting, cl_lock_nr_mutexed(env),
+               func, line); /* filled in by the cl_lock_trace() macro */
+}
+#define cl_lock_trace(level, env, prefix, lock)  /* adds caller's func/line */ \
+        cl_lock_trace0(level, env, prefix, lock, __FUNCTION__, __LINE__)
+
 #define RETIP ((unsigned long)__builtin_return_address(0))
 
 #ifdef CONFIG_LOCKDEP
@@ -244,6 +261,7 @@ static void cl_lock_free(const struct lu_env *env, struct cl_lock *lock)
         LINVRNT(!cl_lock_is_mutexed(lock));
 
         ENTRY;
+        cl_lock_trace(D_DLMTRACE, env, "free lock", lock);
         might_sleep();
         while (!list_empty(&lock->cll_layers)) {
                 struct cl_lock_slice *slice;
@@ -286,7 +304,7 @@ void cl_lock_put(const struct lu_env *env, struct cl_lock *lock)
         head = cl_object_header(obj);
         site = cl_object_site(obj);
 
-        CDEBUG(D_DLMTRACE, "releasing reference: %d %p %lu\n",
+        CDEBUG(D_TRACE, "releasing reference: %d %p %lu\n",
                atomic_read(&lock->cll_ref), lock, RETIP);
 
         if (atomic_dec_and_test(&lock->cll_ref)) {
@@ -311,7 +329,7 @@ EXPORT_SYMBOL(cl_lock_put);
 void cl_lock_get(struct cl_lock *lock)
 {
         LINVRNT(cl_lock_invariant(NULL, lock));
-        CDEBUG(D_DLMTRACE|D_TRACE, "acquiring reference: %d %p %lu\n",
+        CDEBUG(D_TRACE, "acquiring reference: %d %p %lu\n",
                atomic_read(&lock->cll_ref), lock, RETIP);
         atomic_inc(&lock->cll_ref);
 }
@@ -331,7 +349,7 @@ void cl_lock_get_trust(struct cl_lock *lock)
         struct cl_site *site = cl_object_site(lock->cll_descr.cld_obj);
 
         LASSERT(cl_is_lock(lock));
-        CDEBUG(D_DLMTRACE|D_TRACE, "acquiring trusted reference: %d %p %lu\n",
+        CDEBUG(D_TRACE, "acquiring trusted reference: %d %p %lu\n",
                atomic_read(&lock->cll_ref), lock, RETIP);
         if (atomic_inc_return(&lock->cll_ref) == 1)
                 atomic_inc(&site->cs_locks.cs_busy);
@@ -347,6 +365,7 @@ EXPORT_SYMBOL(cl_lock_get_trust);
 static void cl_lock_finish(const struct lu_env *env, struct cl_lock *lock)
 {
         cl_lock_mutex_get(env, lock);
+        cl_lock_cancel(env, lock);
         cl_lock_delete(env, lock);
         cl_lock_mutex_put(env, lock);
         cl_lock_put(env, lock);
@@ -399,6 +418,57 @@ static struct cl_lock *cl_lock_alloc(const struct lu_env *env,
 }
 
 /**
+ * Moves \a lock into CLS_INTRANSIT, records cfs_current() as the intransit
+ * owner, takes an "intransit" hold and returns the state the lock had before.
+ * \pre  cl_lock_is_mutexed(lock); state: CLS_CACHED, CLS_HELD or CLS_ENQUEUED
+ * \post state: CLS_INTRANSIT
+ * \see CLS_INTRANSIT, cl_lock_extransit()
+ */
+enum cl_lock_state cl_lock_intransit(const struct lu_env *env,
+                                     struct cl_lock *lock)
+{
+        enum cl_lock_state state = lock->cll_state; /* handed back to caller */
+
+        LASSERT(cl_lock_is_mutexed(lock));
+        LASSERT(state != CLS_INTRANSIT);
+        LASSERTF(state >= CLS_ENQUEUED && state <= CLS_CACHED,
+                 "Malformed lock state %d.\n", state);
+
+        cl_lock_state_set(env, lock, CLS_INTRANSIT);
+        lock->cll_intransit_owner = cfs_current();
+        cl_lock_hold_add(env, lock, "intransit", cfs_current());
+        return state;
+}
+EXPORT_SYMBOL(cl_lock_intransit);
+
+/**
+ * Ends the transit: clears the owner, sets \a state, drops "intransit" hold.
+ */
+void cl_lock_extransit(const struct lu_env *env, struct cl_lock *lock,
+                       enum cl_lock_state state)
+{
+        LASSERT(cl_lock_is_mutexed(lock));
+        LASSERT(lock->cll_state == CLS_INTRANSIT);
+        LASSERT(state != CLS_INTRANSIT);
+        LASSERT(lock->cll_intransit_owner == cfs_current()); /* owner only */
+
+        lock->cll_intransit_owner = NULL;
+        cl_lock_state_set(env, lock, state);
+        cl_lock_unhold(env, lock, "intransit", cfs_current());
+}
+EXPORT_SYMBOL(cl_lock_extransit);
+
+/**
+ * True iff \a lock is in transit and its owner is not cfs_current().
+ */
+int cl_lock_is_intransit(struct cl_lock *lock)
+{
+        LASSERT(cl_lock_is_mutexed(lock));
+        return lock->cll_state == CLS_INTRANSIT &&
+               lock->cll_intransit_owner != cfs_current(); /* owner gets 0 */
+}
+EXPORT_SYMBOL(cl_lock_is_intransit);
+/**
  * Returns true iff lock is "suitable" for given io. E.g., locks acquired by
  * truncate and O_APPEND cannot be reused for read/non-append-write, as they
  * cover multiple stripes and can trigger cascading timeouts.
@@ -440,16 +510,15 @@ static struct cl_lock *cl_lock_lookup(const struct lu_env *env,
 
                 LASSERT(cl_is_lock(lock));
                 matched = cl_lock_ext_match(&lock->cll_descr, need) &&
-                        lock->cll_state < CLS_FREEING &&
-                        !(lock->cll_flags & CLF_CANCELLED) &&
-                        cl_lock_fits_into(env, lock, need, io);
+                          lock->cll_state < CLS_FREEING &&
+                          lock->cll_error == 0 &&
+                          !(lock->cll_flags & CLF_CANCELLED) &&
+                          cl_lock_fits_into(env, lock, need, io);
                 CDEBUG(D_DLMTRACE, "has: "DDESCR"(%i) need: "DDESCR": %d\n",
                        PDESCR(&lock->cll_descr), lock->cll_state, PDESCR(need),
                        matched);
                 if (matched) {
                         cl_lock_get_trust(lock);
-                        /* move the lock to the LRU head */
-                        list_move(&lock->cll_linkage, &head->coh_locks);
                         atomic_inc(&cl_object_site(obj)->cs_locks.cs_hit);
                         RETURN(lock);
                 }
@@ -494,7 +563,7 @@ static struct cl_lock *cl_lock_find(const struct lu_env *env,
                         spin_lock(&head->coh_lock_guard);
                         ghost = cl_lock_lookup(env, obj, io, need);
                         if (ghost == NULL) {
-                                list_add(&lock->cll_linkage, &head->coh_locks);
+                                list_add_tail(&lock->cll_linkage, &head->coh_locks);
                                 spin_unlock(&head->coh_lock_guard);
                                 atomic_inc(&site->cs_locks.cs_busy);
                         } else {
@@ -524,6 +593,7 @@ struct cl_lock *cl_lock_peek(const struct lu_env *env, const struct cl_io *io,
         struct cl_object_header *head;
         struct cl_object        *obj;
         struct cl_lock          *lock;
+        int ok;
 
         obj  = need->cld_obj;
         head = cl_object_header(obj);
@@ -532,24 +602,30 @@ struct cl_lock *cl_lock_peek(const struct lu_env *env, const struct cl_io *io,
         lock = cl_lock_lookup(env, obj, io, need);
         spin_unlock(&head->coh_lock_guard);
 
-        if (lock != NULL) {
-                int ok;
+        if (lock == NULL)
+                return NULL;
 
-                cl_lock_mutex_get(env, lock);
-                if (lock->cll_state == CLS_CACHED)
-                        cl_use_try(env, lock);
-                ok = lock->cll_state == CLS_HELD;
-                if (ok) {
-                        cl_lock_hold_add(env, lock, scope, source);
-                        cl_lock_user_add(env, lock);
-                        cl_lock_put(env, lock);
-                }
-                cl_lock_mutex_put(env, lock);
-                if (!ok) {
-                        cl_lock_put(env, lock);
-                        lock = NULL;
-                }
+        cl_lock_mutex_get(env, lock);
+        if (lock->cll_state == CLS_INTRANSIT)
+                cl_lock_state_wait(env, lock); /* Don't care return value. */
+        if (lock->cll_state == CLS_CACHED) {
+                int result;
+                result = cl_use_try(env, lock, 1);
+                if (result < 0)
+                        cl_lock_error(env, lock, result);
         }
+        ok = lock->cll_state == CLS_HELD;
+        if (ok) {
+                cl_lock_hold_add(env, lock, scope, source);
+                cl_lock_user_add(env, lock);
+                cl_lock_put(env, lock);
+        }
+        cl_lock_mutex_put(env, lock);
+        if (!ok) {
+                cl_lock_put(env, lock);
+                lock = NULL;
+        }
+
         return lock;
 }
 EXPORT_SYMBOL(cl_lock_peek);
@@ -576,14 +652,6 @@ const struct cl_lock_slice *cl_lock_at(const struct cl_lock *lock,
 }
 EXPORT_SYMBOL(cl_lock_at);
 
-static void cl_lock_trace(struct cl_thread_counters *counters,
-                          const char *prefix, const struct cl_lock *lock)
-{
-        CDEBUG(D_DLMTRACE|D_TRACE, "%s: %i@%p %p %i %i\n", prefix,
-               atomic_read(&lock->cll_ref), lock, lock->cll_guarder,
-               lock->cll_depth, counters->ctc_nr_locks_locked);
-}
-
 static void cl_lock_mutex_tail(const struct lu_env *env, struct cl_lock *lock)
 {
         struct cl_thread_counters *counters;
@@ -592,7 +660,7 @@ static void cl_lock_mutex_tail(const struct lu_env *env, struct cl_lock *lock)
         lock->cll_depth++;
         counters->ctc_nr_locks_locked++;
         lu_ref_add(&counters->ctc_locks_locked, "cll_guard", lock);
-        cl_lock_trace(counters, "got mutex", lock);
+        cl_lock_trace(D_TRACE, env, "got mutex", lock);
 }
 
 /**
@@ -666,7 +734,7 @@ int cl_lock_mutex_try(const struct lu_env *env, struct cl_lock *lock)
 EXPORT_SYMBOL(cl_lock_mutex_try);
 
 /**
- * Unlocks cl_lock object.
+ * Unlocks cl_lock object.
  *
  * \pre cl_lock_is_mutexed(lock)
  *
@@ -684,7 +752,7 @@ void cl_lock_mutex_put(const struct lu_env *env, struct cl_lock *lock)
         counters = cl_lock_counters(env, lock);
         LINVRNT(counters->ctc_nr_locks_locked > 0);
 
-        cl_lock_trace(counters, "put mutex", lock);
+        cl_lock_trace(D_TRACE, env, "put mutex", lock);
         lu_ref_del(&counters->ctc_locks_locked, "cll_guard", lock);
         counters->ctc_nr_locks_locked--;
         if (--lock->cll_depth == 0) {
@@ -752,28 +820,14 @@ static void cl_lock_delete0(const struct lu_env *env, struct cl_lock *lock)
 
         ENTRY;
         if (lock->cll_state < CLS_FREEING) {
+                LASSERT(lock->cll_state != CLS_INTRANSIT);
                 cl_lock_state_set(env, lock, CLS_FREEING);
 
                 head = cl_object_header(lock->cll_descr.cld_obj);
 
                 spin_lock(&head->coh_lock_guard);
                 list_del_init(&lock->cll_linkage);
-                /*
-                 * No locks, no pages. This is only valid for bottom sub-locks
-                 * and head->coh_nesting == 1 check assumes two level top-sub
-                 * hierarchy.
-                 */
-                /*
-                 * The count of pages of this object may NOT be zero because
-                 * we don't cleanup the pages if they are in CPS_FREEING state.
-                 * See cl_page_gang_lookup().
-                 *
-                 * It is safe to leave the CPS_FREEING pages in cache w/o
-                 * a lock, because those page must not be uptodate.
-                 * See cl_page_delete0 for details.
-                 */
-                /* LASSERT(!ergo(head->coh_nesting == 1 &&
-                           list_empty(&head->coh_locks), !head->coh_pages)); */
+
                 spin_unlock(&head->coh_lock_guard);
                 /*
                  * From now on, no new references to this lock can be acquired
@@ -845,6 +899,7 @@ static void cl_lock_hold_release(const struct lu_env *env, struct cl_lock *lock,
         LASSERT(lock->cll_holds > 0);
 
         ENTRY;
+        cl_lock_trace(D_DLMTRACE, env, "hold release lock", lock);
         lu_ref_del(&lock->cll_holders, scope, source);
         cl_lock_hold_mod(env, lock, -1);
         if (lock->cll_holds == 0) {
@@ -899,8 +954,9 @@ int cl_lock_state_wait(const struct lu_env *env, struct cl_lock *lock)
         LASSERT(lock->cll_depth == 1);
         LASSERT(lock->cll_state != CLS_FREEING); /* too late to wait */
 
+        cl_lock_trace(D_DLMTRACE, env, "state wait lock", lock);
         result = lock->cll_error;
-        if (result == 0 && !(lock->cll_flags & CLF_STATE)) {
+        if (result == 0) {
                 cfs_waitlink_init(&waiter);
                 cfs_waitq_add(&lock->cll_wq, &waiter);
                 set_current_state(CFS_TASK_INTERRUPTIBLE);
@@ -914,7 +970,6 @@ int cl_lock_state_wait(const struct lu_env *env, struct cl_lock *lock)
                 cfs_waitq_del(&lock->cll_wq, &waiter);
                 result = cfs_signal_pending() ? -EINTR : 0;
         }
-        lock->cll_flags &= ~CLF_STATE;
         RETURN(result);
 }
 EXPORT_SYMBOL(cl_lock_state_wait);
@@ -931,7 +986,6 @@ static void cl_lock_state_signal(const struct lu_env *env, struct cl_lock *lock,
         list_for_each_entry(slice, &lock->cll_layers, cls_linkage)
                 if (slice->cls_ops->clo_state != NULL)
                         slice->cls_ops->clo_state(env, slice, state);
-        lock->cll_flags |= CLF_STATE;
         cfs_waitq_broadcast(&lock->cll_wq);
         EXIT;
 }
@@ -946,6 +1000,7 @@ static void cl_lock_state_signal(const struct lu_env *env, struct cl_lock *lock,
 void cl_lock_signal(const struct lu_env *env, struct cl_lock *lock)
 {
         ENTRY;
+        cl_lock_trace(D_DLMTRACE, env, "state signal lock", lock);
         cl_lock_state_signal(env, lock, lock->cll_state);
         EXIT;
 }
@@ -970,9 +1025,10 @@ void cl_lock_state_set(const struct lu_env *env, struct cl_lock *lock,
         LASSERT(lock->cll_state <= state ||
                 (lock->cll_state == CLS_CACHED &&
                  (state == CLS_HELD || /* lock found in cache */
-                  state == CLS_NEW     /* sub-lock canceled */)) ||
-                /* sub-lock canceled during unlocking */
-                (lock->cll_state == CLS_UNLOCKING && state == CLS_NEW));
+                  state == CLS_NEW  ||   /* sub-lock canceled */
+                  state == CLS_INTRANSIT)) ||
+                /* lock is in transit state */
+                lock->cll_state == CLS_INTRANSIT);
 
         if (lock->cll_state != state) {
                 atomic_dec(&site->cs_locks_state[lock->cll_state]);
@@ -985,17 +1041,54 @@ void cl_lock_state_set(const struct lu_env *env, struct cl_lock *lock,
 }
 EXPORT_SYMBOL(cl_lock_state_set);
 
+static int cl_unuse_try_internal(const struct lu_env *env, struct cl_lock *lock)
+{
+        const struct cl_lock_slice *slice;
+        int result;
+        /* Runs clo_unuse() over the layers in reverse; lock must be in transit. */
+        do {
+                result = 0;
+
+                LINVRNT(cl_lock_is_mutexed(lock));
+                LINVRNT(cl_lock_invariant(env, lock));
+                LASSERT(lock->cll_state == CLS_INTRANSIT);
+                /* -ENOSYS survives only if no layer implements clo_unuse() */
+                result = -ENOSYS;
+                list_for_each_entry_reverse(slice, &lock->cll_layers,
+                                            cls_linkage) {
+                        if (slice->cls_ops->clo_unuse != NULL) {
+                                result = slice->cls_ops->clo_unuse(env, slice);
+                                if (result != 0)
+                                        break;
+                        }
+                }
+                LASSERT(result != -ENOSYS);
+        } while (result == CLO_REPEAT); /* rescan while a layer asks to */
+
+        return result;
+}
+
 /**
  * Yanks lock from the cache (cl_lock_state::CLS_CACHED state) by calling
  * cl_lock_operations::clo_use() top-to-bottom to notify layers.
+ * If @atomic is 1 and using the lock fails, the lock is unused again to
+ *  back the operation off, so that the use process stays atomic.
  */
-int cl_use_try(const struct lu_env *env, struct cl_lock *lock)
+int cl_use_try(const struct lu_env *env, struct cl_lock *lock, int atomic)
 {
-        int result;
         const struct cl_lock_slice *slice;
+        int result;
+        enum cl_lock_state state;
 
         ENTRY;
+        cl_lock_trace(D_DLMTRACE, env, "use lock", lock);
+
+        LASSERT(lock->cll_state == CLS_CACHED);
+        if (lock->cll_error)
+                RETURN(lock->cll_error);
+
         result = -ENOSYS;
+        state = cl_lock_intransit(env, lock);
         list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
                 if (slice->cls_ops->clo_use != NULL) {
                         result = slice->cls_ops->clo_use(env, slice);
@@ -1004,8 +1097,34 @@ int cl_use_try(const struct lu_env *env, struct cl_lock *lock)
                 }
         }
         LASSERT(result != -ENOSYS);
-        if (result == 0)
-                cl_lock_state_set(env, lock, CLS_HELD);
+
+        LASSERTF(lock->cll_state == CLS_INTRANSIT, "Wrong state %d.\n",
+                 lock->cll_state);
+
+        if (result == 0) {
+                state = CLS_HELD;
+        } else {
+                if (result == -ESTALE) {
+                        /*
+                         * ESTALE means sublock being cancelled
+                         * at this time, and set lock state to
+                         * be NEW here and ask the caller to repeat.
+                         */
+                        state = CLS_NEW;
+                        result = CLO_REPEAT;
+                }
+
+                /* @atomic means back-off-on-failure. */
+                if (atomic) {
+                        int rc;
+                        rc = cl_unuse_try_internal(env, lock);
+                        /* Vet the results. */
+                        if (rc < 0 && result > 0)
+                                result = rc;
+                }
+
+        }
+        cl_lock_extransit(env, lock, state);
         RETURN(result);
 }
 EXPORT_SYMBOL(cl_use_try);
@@ -1054,6 +1173,7 @@ int cl_enqueue_try(const struct lu_env *env, struct cl_lock *lock,
         int result;
 
         ENTRY;
+        cl_lock_trace(D_DLMTRACE, env, "enqueue lock", lock);
         do {
                 result = 0;
 
@@ -1071,14 +1191,13 @@ int cl_enqueue_try(const struct lu_env *env, struct cl_lock *lock,
                         if (result == 0)
                                 cl_lock_state_set(env, lock, CLS_ENQUEUED);
                         break;
-                case CLS_UNLOCKING:
-                        /* wait until unlocking finishes, and enqueue lock
-                         * afresh. */
+                case CLS_INTRANSIT:
+                        LASSERT(cl_lock_is_intransit(lock));
                         result = CLO_WAIT;
                         break;
                 case CLS_CACHED:
                         /* yank lock from the cache. */
-                        result = cl_use_try(env, lock);
+                        result = cl_use_try(env, lock, 0);
                         break;
                 case CLS_ENQUEUED:
                 case CLS_HELD:
@@ -1123,8 +1242,7 @@ static int cl_enqueue_locked(const struct lu_env *env, struct cl_lock *lock,
         } while (1);
         if (result != 0) {
                 cl_lock_user_del(env, lock);
-                if (result != -EINTR)
-                        cl_lock_error(env, lock, result);
+                cl_lock_error(env, lock, result);
         }
         LASSERT(ergo(result == 0, lock->cll_state == CLS_ENQUEUED ||
                      lock->cll_state == CLS_HELD));
@@ -1164,8 +1282,9 @@ EXPORT_SYMBOL(cl_enqueue);
  *
  * This function is called repeatedly by cl_unuse() until either lock is
  * unlocked, or error occurs.
+ * cl_unuse_try is a one-shot operation, so it must NOT return CLO_WAIT.
  *
- * \ppre lock->cll_state <= CLS_HELD || lock->cll_state == CLS_UNLOCKING
+ * \pre  lock->cll_state == CLS_HELD
  *
  * \post ergo(result == 0, lock->cll_state == CLS_CACHED)
  *
@@ -1174,57 +1293,31 @@ EXPORT_SYMBOL(cl_enqueue);
  */
 int cl_unuse_try(const struct lu_env *env, struct cl_lock *lock)
 {
-        const struct cl_lock_slice *slice;
         int                         result;
+        enum cl_lock_state          state = CLS_NEW;
 
         ENTRY;
-        if (lock->cll_state != CLS_UNLOCKING) {
-                if (lock->cll_users > 1) {
-                        cl_lock_user_del(env, lock);
-                        RETURN(0);
-                }
-                /*
-                 * New lock users (->cll_users) are not protecting unlocking
-                 * from proceeding. From this point, lock eventually reaches
-                 * CLS_CACHED, is reinitialized to CLS_NEW or fails into
-                 * CLS_FREEING.
-                 */
-                cl_lock_state_set(env, lock, CLS_UNLOCKING);
-        }
-        do {
-                result = 0;
+        cl_lock_trace(D_DLMTRACE, env, "unuse lock", lock);
 
-                if (lock->cll_error != 0)
-                        break;
+        LASSERT(lock->cll_state == CLS_HELD || lock->cll_state == CLS_ENQUEUED);
+        if (lock->cll_users > 1) {
+                cl_lock_user_del(env, lock);
+                RETURN(0);
+        }
 
-                LINVRNT(cl_lock_is_mutexed(lock));
-                LINVRNT(cl_lock_invariant(env, lock));
-                LASSERT(lock->cll_state == CLS_UNLOCKING);
-                LASSERT(lock->cll_users > 0);
-                LASSERT(lock->cll_holds > 0);
+        /*
+         * New lock users (->cll_users) are not protecting unlocking
+         * from proceeding. From this point, lock eventually reaches
+         * CLS_CACHED, is reinitialized to CLS_NEW or fails into
+         * CLS_FREEING.
+         */
+        state = cl_lock_intransit(env, lock);
 
-                result = -ENOSYS;
-                list_for_each_entry_reverse(slice, &lock->cll_layers,
-                                            cls_linkage) {
-                        if (slice->cls_ops->clo_unuse != NULL) {
-                                result = slice->cls_ops->clo_unuse(env, slice);
-                                if (result != 0)
-                                        break;
-                        }
-                }
-                LASSERT(result != -ENOSYS);
-        } while (result == CLO_REPEAT);
-        if (result != CLO_WAIT)
-                /*
-                 * Once there is no more need to iterate ->clo_unuse() calls,
-                 * remove lock user. This is done even if unrecoverable error
-                 * happened during unlocking, because nothing else can be
-                 * done.
-                 */
-                cl_lock_user_del(env, lock);
+        result = cl_unuse_try_internal(env, lock);
+        LASSERT(lock->cll_state == CLS_INTRANSIT);
+        LASSERT(result != CLO_WAIT);
+        cl_lock_user_del(env, lock);
         if (result == 0 || result == -ESTALE) {
-                enum cl_lock_state state;
-
                 /*
                  * Return lock back to the cache. This is the only
                  * place where lock is moved into CLS_CACHED state.
@@ -1234,8 +1327,11 @@ int cl_unuse_try(const struct lu_env *env, struct cl_lock *lock)
                  * re-initialized. This happens e.g., when a sub-lock was
                  * canceled while unlocking was in progress.
                  */
-                state = result == 0 ? CLS_CACHED : CLS_NEW;
-                cl_lock_state_set(env, lock, state);
+                if (state == CLS_HELD && result == 0)
+                        state = CLS_CACHED;
+                else
+                        state = CLS_NEW;
+                cl_lock_extransit(env, lock, state);
 
                 /*
                  * Hide -ESTALE error.
@@ -1247,7 +1343,11 @@ int cl_unuse_try(const struct lu_env *env, struct cl_lock *lock)
                  * pages won't be written to OSTs. -jay
                  */
                 result = 0;
+        } else {
+                CERROR("result = %d, this is unlikely!\n", result);
+                cl_lock_extransit(env, lock, state);
         }
+
         result = result ?: lock->cll_error;
         if (result < 0)
                 cl_lock_error(env, lock, result);
@@ -1257,19 +1357,13 @@ EXPORT_SYMBOL(cl_unuse_try);
 
 static void cl_unuse_locked(const struct lu_env *env, struct cl_lock *lock)
 {
+        int result;
         ENTRY;
-        LASSERT(lock->cll_state <= CLS_HELD);
-        do {
-                int result;
 
-                result = cl_unuse_try(env, lock);
-                if (result == CLO_WAIT) {
-                        result = cl_lock_state_wait(env, lock);
-                        if (result == 0)
-                                continue;
-                }
-                break;
-        } while (1);
+        result = cl_unuse_try(env, lock);
+        if (result)
+                CL_LOCK_DEBUG(D_ERROR, env, lock, "unuse return %d\n", result);
+
         EXIT;
 }
 
@@ -1303,17 +1397,25 @@ int cl_wait_try(const struct lu_env *env, struct cl_lock *lock)
         int                         result;
 
         ENTRY;
+        cl_lock_trace(D_DLMTRACE, env, "wait lock try", lock);
         do {
                 LINVRNT(cl_lock_is_mutexed(lock));
                 LINVRNT(cl_lock_invariant(env, lock));
                 LASSERT(lock->cll_state == CLS_ENQUEUED ||
-                        lock->cll_state == CLS_HELD);
+                        lock->cll_state == CLS_HELD ||
+                        lock->cll_state == CLS_INTRANSIT);
                 LASSERT(lock->cll_users > 0);
                 LASSERT(lock->cll_holds > 0);
 
                 result = 0;
                 if (lock->cll_error != 0)
                         break;
+
+                if (cl_lock_is_intransit(lock)) {
+                        result = CLO_WAIT;
+                        break;
+                }
+
                 if (lock->cll_state == CLS_HELD)
                         /* nothing to do */
                         break;
@@ -1327,8 +1429,10 @@ int cl_wait_try(const struct lu_env *env, struct cl_lock *lock)
                         }
                 }
                 LASSERT(result != -ENOSYS);
-                if (result == 0)
+                if (result == 0) {
+                        LASSERT(lock->cll_state != CLS_INTRANSIT);
                         cl_lock_state_set(env, lock, CLS_HELD);
+                }
         } while (result == CLO_REPEAT);
         RETURN(result ?: lock->cll_error);
 }
@@ -1351,8 +1455,10 @@ int cl_wait(const struct lu_env *env, struct cl_lock *lock)
         cl_lock_mutex_get(env, lock);
 
         LINVRNT(cl_lock_invariant(env, lock));
-        LASSERT(lock->cll_state == CLS_ENQUEUED || lock->cll_state == CLS_HELD);
+        LASSERTF(lock->cll_state == CLS_ENQUEUED || lock->cll_state == CLS_HELD,
+                 "Wrong state %d \n", lock->cll_state);
         LASSERT(lock->cll_holds > 0);
+        cl_lock_trace(D_DLMTRACE, env, "wait lock", lock);
 
         do {
                 result = cl_wait_try(env, lock);
@@ -1365,8 +1471,7 @@ int cl_wait(const struct lu_env *env, struct cl_lock *lock)
         } while (1);
         if (result < 0) {
                 cl_lock_user_del(env, lock);
-                if (result != -EINTR)
-                        cl_lock_error(env, lock, result);
+                cl_lock_error(env, lock, result);
                 cl_lock_lockdep_release(env, lock);
         }
         cl_lock_mutex_put(env, lock);
@@ -1421,6 +1526,7 @@ int cl_lock_modify(const struct lu_env *env, struct cl_lock *lock,
         int result;
 
         ENTRY;
+        cl_lock_trace(D_DLMTRACE, env, "modify lock", lock);
         /* don't allow object to change */
         LASSERT(obj == desc->cld_obj);
         LINVRNT(cl_lock_is_mutexed(lock));
@@ -1513,8 +1619,9 @@ EXPORT_SYMBOL(cl_lock_closure_build);
 int cl_lock_enclosure(const struct lu_env *env, struct cl_lock *lock,
                       struct cl_lock_closure *closure)
 {
-        int result;
+        int result = 0;
         ENTRY;
+        cl_lock_trace(D_DLMTRACE, env, "enclosure lock", lock);
         if (!cl_lock_mutex_try(env, lock)) {
                 /*
                  * If lock->cll_inclosure is not empty, lock is already in
@@ -1556,6 +1663,7 @@ void cl_lock_disclosure(const struct lu_env *env,
         struct cl_lock *scan;
         struct cl_lock *temp;
 
+        cl_lock_trace(D_DLMTRACE, env, "disclosure lock", closure->clc_origin);
         list_for_each_entry_safe(scan, temp, &closure->clc_list, cll_inclosure){
                 list_del_init(&scan->cll_inclosure);
                 cl_lock_mutex_put(env, scan);
@@ -1604,6 +1712,7 @@ void cl_lock_delete(const struct lu_env *env, struct cl_lock *lock)
                      cl_lock_nr_mutexed(env) == 1));
 
         ENTRY;
+        cl_lock_trace(D_DLMTRACE, env, "delete lock", lock);
         if (lock->cll_holds == 0)
                 cl_lock_delete0(env, lock);
         else
@@ -1628,6 +1737,7 @@ void cl_lock_error(const struct lu_env *env, struct cl_lock *lock, int error)
         LINVRNT(cl_lock_invariant(env, lock));
 
         ENTRY;
+        cl_lock_trace(D_DLMTRACE, env, "set lock error", lock);
         if (lock->cll_error == 0 && error != 0) {
                 lock->cll_error = error;
                 cl_lock_signal(env, lock);
@@ -1655,6 +1765,7 @@ void cl_lock_cancel(const struct lu_env *env, struct cl_lock *lock)
         LINVRNT(cl_lock_invariant(env, lock));
 
         ENTRY;
+        cl_lock_trace(D_DLMTRACE, env, "cancel lock", lock);
         if (lock->cll_holds == 0)
                 cl_lock_cancel0(env, lock);
         else
@@ -1685,11 +1796,15 @@ struct cl_lock *cl_lock_at_page(const struct lu_env *env, struct cl_object *obj,
         need->cld_mode = CLM_READ; /* CLM_READ matches both READ & WRITE, but
                                     * not PHANTOM */
         need->cld_start = need->cld_end = page->cp_index;
+        need->cld_enq_flags = 0;
 
         spin_lock(&head->coh_lock_guard);
+        /* It is fine to match any group lock since there could be only one
+         * with a uniq gid and it conflicts with all other lock modes too */
         list_for_each_entry(scan, &head->coh_locks, cll_linkage) {
                 if (scan != except &&
-                    cl_lock_ext_match(&scan->cll_descr, need) &&
+                    (scan->cll_descr.cld_mode == CLM_GROUP ||
+                    cl_lock_ext_match(&scan->cll_descr, need)) &&
                     scan->cll_state >= CLS_HELD &&
                     scan->cll_state < CLS_FREEING &&
                     /*
@@ -1813,9 +1928,12 @@ int cl_lock_page_out(const struct lu_env *env, struct cl_lock *lock,
         io->ci_obj = cl_object_top(descr->cld_obj);
         result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
         if (result == 0) {
+                int nonblock = 1;
+
+restart:
                 cl_2queue_init(queue);
                 cl_page_gang_lookup(env, descr->cld_obj, io, descr->cld_start,
-                                    descr->cld_end, &queue->c2_qin);
+                                    descr->cld_end, &queue->c2_qin, nonblock);
                 page_count = queue->c2_qin.pl_nr;
                 if (page_count > 0) {
                         result = cl_page_list_unmap(env, io, &queue->c2_qin);
@@ -1837,6 +1955,11 @@ int cl_lock_page_out(const struct lu_env *env, struct cl_lock *lock,
                         cl_2queue_disown(env, io, queue);
                 }
                 cl_2queue_fini(env, queue);
+
+                if (nonblock) {
+                        nonblock = 0;
+                        goto restart;
+                }
         }
         cl_io_fini(env, io);
         RETURN(result);
@@ -1918,7 +2041,8 @@ static struct cl_lock *cl_lock_hold_mutex(const struct lu_env *env,
                 if (IS_ERR(lock))
                         break;
                 cl_lock_mutex_get(env, lock);
-                if (lock->cll_state < CLS_FREEING) {
+                if (lock->cll_state < CLS_FREEING &&
+                    !(lock->cll_flags & CLF_CANCELLED)) {
                         cl_lock_hold_mod(env, lock, +1);
                         lu_ref_add(&lock->cll_holders, scope, source);
                         lu_ref_add(&lock->cll_reference, scope, source);
@@ -1958,23 +2082,18 @@ EXPORT_SYMBOL(cl_lock_hold);
  */
 struct cl_lock *cl_lock_request(const struct lu_env *env, struct cl_io *io,
                                 const struct cl_lock_descr *need,
-                                __u32 enqflags,
                                 const char *scope, const void *source)
 {
         struct cl_lock       *lock;
         const struct lu_fid  *fid;
         int                   rc;
         int                   iter;
-        int warn;
+        __u32                 enqflags = need->cld_enq_flags;
 
         ENTRY;
         fid = lu_object_fid(&io->ci_obj->co_lu);
         iter = 0;
         do {
-                warn = iter >= 16 && IS_PO2(iter);
-                CDEBUG(warn ? D_WARNING : D_DLMTRACE,
-                       DDESCR"@"DFID" %i %08x `%s'\n",
-                       PDESCR(need), PFID(fid), iter, enqflags, scope);
                 lock = cl_lock_hold_mutex(env, io, need, scope, source);
                 if (!IS_ERR(lock)) {
                         rc = cl_enqueue_locked(env, lock, io, enqflags);
@@ -1984,11 +2103,10 @@ struct cl_lock *cl_lock_request(const struct lu_env *env, struct cl_io *io,
                                         cl_lock_lockdep_acquire(env,
                                                                 lock, enqflags);
                                         break;
-                                } else if (warn)
-                                        CL_LOCK_DEBUG(D_WARNING, env, lock,
-                                                      "got (see bug 17665)\n");
+                                }
                                 cl_unuse_locked(env, lock);
                         }
+                        cl_lock_trace(D_DLMTRACE, env, "enqueue failed", lock);
                         cl_lock_hold_release(env, lock, scope, source);
                         cl_lock_mutex_put(env, lock);
                         lu_ref_del(&lock->cll_reference, scope, source);
@@ -2045,6 +2163,7 @@ void cl_lock_release(const struct lu_env *env, struct cl_lock *lock,
 {
         LINVRNT(cl_lock_invariant(env, lock));
         ENTRY;
+        cl_lock_trace(D_DLMTRACE, env, "release lock", lock);
         cl_lock_mutex_get(env, lock);
         cl_lock_hold_release(env, lock, scope, source);
         cl_lock_mutex_put(env, lock);
@@ -2077,37 +2196,18 @@ int cl_lock_user_del(const struct lu_env *env, struct cl_lock *lock)
 }
 EXPORT_SYMBOL(cl_lock_user_del);
 
-/**
- * Check if two lock's mode are compatible.
- *
- * This returns true iff en-queuing \a lock2 won't cause cancellation of \a
- * lock1 even when these locks overlap.
- */
-int cl_lock_compatible(const struct cl_lock *lock1, const struct cl_lock *lock2)
-{
-        enum cl_lock_mode mode1;
-        enum cl_lock_mode mode2;
-
-        ENTRY;
-        mode1 = lock1->cll_descr.cld_mode;
-        mode2 = lock2->cll_descr.cld_mode;
-        RETURN(mode2 == CLM_PHANTOM ||
-               (mode1 == CLM_READ && mode2 == CLM_READ));
-}
-EXPORT_SYMBOL(cl_lock_compatible);
-
 const char *cl_lock_mode_name(const enum cl_lock_mode mode)
 {
         static const char *names[] = {
-                [CLM_PHANTOM] = "PHANTOM",
-                [CLM_READ]    = "READ",
-                [CLM_WRITE]   = "WRITE",
-                [CLM_GROUP]   = "GROUP"
+                [CLM_PHANTOM] = "P",
+                [CLM_READ]    = "R",
+                [CLM_WRITE]   = "W",
+                [CLM_GROUP]   = "G"
         };
         if (0 <= mode && mode < ARRAY_SIZE(names))
                 return names[mode];
         else
-                return "UNKNW";
+                return "U";
 }
 EXPORT_SYMBOL(cl_lock_mode_name);