b=11300

[fs/lustre-release.git] / lustre / ldlm / ldlm_extent.c
diff --git a/lustre/ldlm/ldlm_extent.c b/lustre/ldlm/ldlm_extent.c

index 391d493..f1f88ce 100644 (file)
--- a/lustre/ldlm/ldlm_extent.c
+++ b/lustre/ldlm/ldlm_extent.c
@@ -5,20 +5,23 @@
   *   Author: Peter Braam <braam@clusterfs.com>
   *   Author: Phil Schwan <phil@clusterfs.com>
   *
- *   This file is part of Lustre, http://www.lustre.org.
+ *   This file is part of the Lustre file system, http://www.lustre.org
+ *   Lustre is a trademark of Cluster File Systems, Inc.
   *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ *   You may have signed or agreed to another license before downloading
+ *   this software.  If so, you are bound by the terms and conditions
+ *   of that agreement, and the following does not apply to you.  See the
+ *   LICENSE file included with this distribution for more information.
   *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ *   If you did not agree to a different license, then this copy of Lustre
+ *   is open source software; you can redistribute it and/or modify it
+ *   under the terms of version 2 of the GNU General Public License as
+ *   published by the Free Software Foundation.
   *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *   In either case, Lustre is distributed in the hope that it will be
+ *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   license text for more details.
   */
  
  #define DEBUG_SUBSYSTEM S_LDLM
@@ -26,22 +29,122 @@
  # include <liblustre.h>
  #endif
  
-#include <linux/lustre_dlm.h>
-#include <linux/obd_support.h>
-#include <linux/lustre_lib.h>
+#include <lustre_dlm.h>
+#include <obd_support.h>
+#include <lustre_lib.h>
  
  #include "ldlm_internal.h"
  
+#define LDLM_MAX_GROWN_EXTENT (32 * 1024 * 1024 - 1)
+
+/* Fix up the ldlm_extent after expanding: cap the growth of contended PW/CW
+ * locks, then shrink the extent to the alignment of the requested extent. */
+static void ldlm_extent_internal_policy_fixup(struct ldlm_lock *req,
+                                              struct ldlm_extent *new_ex,
+                                              int conflicting)
+{
+        ldlm_mode_t req_mode = req->l_req_mode;
+        __u64 req_start = req->l_req_extent.start;
+        __u64 req_end = req->l_req_extent.end;
+        __u64 req_align, mask;
+
+        if (conflicting > 32 && (req_mode == LCK_PW || req_mode == LCK_CW)) {
+                if (req_end < req_start + LDLM_MAX_GROWN_EXTENT)
+                        new_ex->end = min(req_start + LDLM_MAX_GROWN_EXTENT,
+                                          new_ex->end);
+        }
+
+        if (new_ex->start == 0 && new_ex->end == OBD_OBJECT_EOF) {
+                EXIT;
+                return;
+        }
+
+        /* we need to ensure that the lock extent is properly aligned to what
+         * the client requested: the lowest-common denominator of its start
+         * and end alignment.  Only widen the mask for 4kB-aligned requests: an
+         * unaligned one may match no bit at all and spin forever at mask == 0. */
+        mask = 0x1000ULL;
+        req_align = (req_end + 1) | req_start;
+        if (req_align != 0 && (req_align & (mask - 1)) == 0) {
+                while ((req_align & mask) == 0)
+                        mask <<= 1;
+        }
+        mask -= 1;
+        /* Shrink only, never grow; the request must stay covered. */
+        new_ex->start = ((new_ex->start - 1) | mask) + 1;
+        new_ex->end = ((new_ex->end + 1) & ~mask) - 1;
+        LASSERTF(new_ex->start <= req_start,
+                 "mask "LPX64" grant start "LPU64" req start "LPU64"\n",
+                 mask, new_ex->start, req_start);
+        LASSERTF(new_ex->end >= req_end,
+                 "mask "LPX64" grant end "LPU64" req end "LPU64"\n",
+                 mask, new_ex->end, req_end);
+}
+
+/* The purpose of this function is to return:
+ * - the maximum extent
+ * - containing the requested extent
+ * - and not overlapping existing conflicting extents outside the requested one
+ *
+ * Use interval tree to expand the lock extent for granted lock; new_ex holds
+ * the starting limits on entry and the result on return. */
+static void ldlm_extent_internal_policy_granted(struct ldlm_lock *req,
+                                                struct ldlm_extent *new_ex)
+{
+        struct ldlm_resource *res = req->l_resource;
+        ldlm_mode_t req_mode = req->l_req_mode;
+        __u64 req_start = req->l_req_extent.start;
+        __u64 req_end = req->l_req_extent.end;
+        struct ldlm_interval_tree *tree;
+        struct interval_node_extent limiter = { new_ex->start, new_ex->end };
+        int conflicting = 0;
+        int idx;
+        ENTRY;
+
+        lockmode_verify(req_mode);
+
+        /* using interval tree to handle the ldlm extent granted locks */
+        for (idx = 0; idx < LCK_MODE_NUM; idx++) {
+                struct interval_node_extent ext = { req_start, req_end };
+
+                tree = &res->lr_itree[idx];
+                if (lockmode_compat(tree->lit_mode, req_mode))
+                        continue;
+                /* more than 4 conflicting locks: stop growing downwards */
+                conflicting += tree->lit_size;
+                if (conflicting > 4)
+                        limiter.start = req_start;
+                if (interval_is_overlapped(tree->lit_root, &ext))
+                        CDEBUG(D_INFO, "req_mode = %d, tree->lit_mode = %d, "
+                               "tree->lit_size = %d\n", req_mode,
+                               tree->lit_mode, tree->lit_size);
+                interval_expand(tree->lit_root, &ext, &limiter);
+                limiter.start = max(limiter.start, ext.start);
+                limiter.end = min(limiter.end, ext.end);
+                if (limiter.start == req_start && limiter.end == req_end)
+                        break;
+        }
+
+        new_ex->start = limiter.start;
+        new_ex->end = limiter.end;
+        LASSERT(new_ex->start <= req_start);
+        LASSERT(new_ex->end >= req_end);
+
+        ldlm_extent_internal_policy_fixup(req, new_ex, conflicting);
+        EXIT;
+}
+
  /* The purpose of this function is to return:
   * - the maximum extent
   * - containing the requested extent
   * - and not overlapping existing conflicting extents outside the requested one
   */
  static void
-ldlm_extent_internal_policy(struct list_head *queue, struct ldlm_lock *req,
-                            struct ldlm_extent *new_ex)
+ldlm_extent_internal_policy_waiting(struct ldlm_lock *req,
+                                    struct ldlm_extent *new_ex)
  {
          struct list_head *tmp;
+        struct ldlm_resource *res = req->l_resource;
          ldlm_mode_t req_mode = req->l_req_mode;
          __u64 req_start = req->l_req_extent.start;
          __u64 req_end = req->l_req_extent.end;
@@ -50,13 +153,15 @@ ldlm_extent_internal_policy(struct list_head *queue, struct ldlm_lock *req,
  
          lockmode_verify(req_mode);
  
-        list_for_each(tmp, queue) {
+        /* for waiting locks */
+        list_for_each(tmp, &res->lr_waiting) {
                  struct ldlm_lock *lock;
                  struct ldlm_extent *l_extent;
  
                  lock = list_entry(tmp, struct ldlm_lock, l_res_link);
                  l_extent = &lock->l_policy_data.l_extent;
  
+                /* We already hit the minimum requested size, search no more */
                  if (new_ex->start == req_start && new_ex->end == req_end) {
                          EXIT;
                          return;
@@ -80,16 +185,14 @@ ldlm_extent_internal_policy(struct list_head *queue, struct ldlm_lock *req,
                          new_ex->start = req_start;
  
                  /* If lock doesn't overlap new_ex, skip it. */
-                if (l_extent->end < new_ex->start ||
-                    l_extent->start > new_ex->end)
+                if (!ldlm_extent_overlap(l_extent, new_ex))
                          continue;
  
                  /* Locks conflicting in requested extents and we can't satisfy
                   * both locks, so ignore it.  Either we will ping-pong this
                   * extent (we would regardless of what extent we granted) or
                   * lock is unused and it shouldn't limit our extent growth. */
-                if (lock->l_req_extent.end >= req_start &&
-                    lock->l_req_extent.start <= req_end)
+                if (ldlm_extent_overlap(&lock->l_req_extent,&req->l_req_extent))
                          continue;
  
                  /* We grow extents downwards only as far as they don't overlap
@@ -118,24 +221,35 @@ ldlm_extent_internal_policy(struct list_head *queue, struct ldlm_lock *req,
                  }
          }
  
-#define LDLM_MAX_GROWN_EXTENT (32 * 1024 * 1024 - 1)
-        if (conflicting > 32 && (req_mode == LCK_PW || req_mode == LCK_CW)) {
-                if (req_end < req_start + LDLM_MAX_GROWN_EXTENT)
-                        new_ex->end = min(req_start + LDLM_MAX_GROWN_EXTENT,
-                                          new_ex->end);
-        }
+        ldlm_extent_internal_policy_fixup(req, new_ex, conflicting);
          EXIT;
  }
  
+
  /* In order to determine the largest possible extent we can grant, we need
   * to scan all of the queues. */
  static void ldlm_extent_policy(struct ldlm_resource *res,
                                 struct ldlm_lock *lock, int *flags)
  {
-        struct ldlm_extent new_ex = { .start = 0, .end = ~0};
-
-        ldlm_extent_internal_policy(&res->lr_granted, lock, &new_ex);
-        ldlm_extent_internal_policy(&res->lr_waiting, lock, &new_ex);
+        struct ldlm_extent new_ex = { .start = 0, .end = OBD_OBJECT_EOF };
+
+        if (lock->l_export == NULL)
+                /*
+                 * this is local lock taken by server (e.g., as a part of
+                 * OST-side locking, or unlink handling). Expansion doesn't
+                 * make a lot of sense for local locks, because they are
+                 * dropped immediately on operation completion and would only
+                 * conflict with other threads.
+                 */
+                return;
+
+        if (lock->l_policy_data.l_extent.start == 0 &&
+            lock->l_policy_data.l_extent.end == OBD_OBJECT_EOF)
+                /* fast-path whole file locks */
+                return;
+
+        ldlm_extent_internal_policy_granted(lock, &new_ex);
+        ldlm_extent_internal_policy_waiting(lock, &new_ex);
  
          if (new_ex.start != lock->l_policy_data.l_extent.start ||
              new_ex.end != lock->l_policy_data.l_extent.end) {
@@ -145,6 +259,42 @@ static void ldlm_extent_policy(struct ldlm_resource *res,
          }
  }
  
+struct ldlm_extent_compat_args {
+        struct list_head *work_list; /* receives blocking AST work items */
+        struct ldlm_lock *lock;      /* the lock being enqueued */
+        ldlm_mode_t mode;            /* mode of the tree being walked */
+        int *compat;                 /* zeroed by the callback when non-NULL */
+};
+
+/* Interval-tree callback: queue a blocking AST work item for each lock of the
+ * node's group that has a blocking AST, and clear *compat when it is given. */
+static enum interval_iter ldlm_extent_compat_cb(struct interval_node *n,
+                                                void *data)
+{
+        struct ldlm_extent_compat_args *args = data;
+        struct ldlm_interval *ival = to_ldlm_interval(n);
+        struct ldlm_lock *granted;
+        ENTRY;
+
+        LASSERT(!list_empty(&ival->li_group));
+
+        list_for_each_entry(granted, &ival->li_group, l_sl_policy) {
+                /* interval tree is for granted lock */
+                LASSERTF(args->mode == granted->l_granted_mode,
+                         "mode = %s, lock->l_granted_mode = %s\n",
+                         ldlm_lockname[args->mode],
+                         ldlm_lockname[granted->l_granted_mode]);
+                if (granted->l_blocking_ast)
+                        ldlm_add_ast_work_item(granted, args->lock,
+                                               args->work_list);
+        }
+
+        if (args->compat != NULL)
+                *args->compat = 0;
+
+        RETURN(INTERVAL_ITER_CONT);
+}
+
  /* Determine if the lock is compatible with all locks on the queue.
   * We stop walking the queue if we hit ourselves so we don't take
   * conflicting locks enqueued after us into accound, or we'd wait forever.
@@ -161,6 +311,7 @@ ldlm_extent_compat_queue(struct list_head *queue, struct ldlm_lock *req,
  {
          struct list_head *tmp;
          struct ldlm_lock *lock;
+        struct ldlm_resource *res = req->l_resource;
          ldlm_mode_t req_mode = req->l_req_mode;
          __u64 req_start = req->l_req_extent.start;
          __u64 req_end = req->l_req_extent.end;
@@ -170,22 +321,87 @@ ldlm_extent_compat_queue(struct list_head *queue, struct ldlm_lock *req,
  
          lockmode_verify(req_mode);
  
+        /* Using interval tree for granted lock */
+        if (queue == &res->lr_granted) {
+                struct ldlm_interval_tree *tree;
+                struct ldlm_extent_compat_args data = {.work_list = work_list,
+                                               .lock = req,
+                                               .compat = &compat };
+                struct interval_node_extent ex = { .start = req_start,
+                                                   .end = req_end };
+                int idx, rc;
+
+                for (idx = 0; idx < LCK_MODE_NUM; idx++) {
+                        tree = &res->lr_itree[idx];
+                        if (tree->lit_root == NULL) /* empty tree, skipped */
+                                continue;
+
+                        data.mode = tree->lit_mode;
+                        if (lockmode_compat(req_mode, tree->lit_mode)) {
+                                struct ldlm_interval *node;
+                                struct ldlm_extent *extent;
+
+                                if (req_mode != LCK_GROUP)
+                                        continue;
+
+                                /* group lock, grant it immediately if
+                                 * compatible */
+                                node = to_ldlm_interval(tree->lit_root);
+                                extent = ldlm_interval_extent(node);
+                                if (req->l_policy_data.l_extent.gid ==
+                                    extent->gid)
+                                        RETURN(2);
+                        }
+
+                        if (tree->lit_mode == LCK_GROUP) {
+                                if (*flags & LDLM_FL_BLOCK_NOWAIT) {
+                                        compat = -EWOULDBLOCK;
+                                        goto destroylock;
+                                }
+
+                                *flags |= LDLM_FL_NO_TIMEOUT;
+                                if (!work_list)
+                                        RETURN(0);
+
+                                /* if work list is not NULL,add all
+                                   locks in the tree to work list */
+                                compat = 0;
+                                interval_iterate(tree->lit_root,
+                                                 ldlm_extent_compat_cb, &data);
+                                continue;
+                        }
+
+                        if (!work_list) {
+                                rc = interval_is_overlapped(tree->lit_root,&ex);
+                                if (rc)
+                                        RETURN(0);
+                        } else {
+                                interval_search(tree->lit_root, &ex,
+                                                ldlm_extent_compat_cb, &data);
+                                if (!list_empty(work_list) && compat)
+                                        compat = 0;
+                        }
+                }
+                RETURN(compat);
+        }
+
+        /* for waiting queue */
          list_for_each(tmp, queue) {
                  lock = list_entry(tmp, struct ldlm_lock, l_res_link);
  
                  if (req == lock)
                          RETURN(compat);
  
-                if (scan) {
+                if (unlikely(scan)) {
                          /* We only get here if we are queuing GROUP lock
                             and met some incompatible one. The main idea of this
                             code is to insert GROUP lock past compatible GROUP
                             lock in the waiting queue or if there is not any,
                             then in front of first non-GROUP lock */
                          if (lock->l_req_mode != LCK_GROUP) {
-                        /* Ok, we hit non-GROUP lock, there should be no
-                           more GROUP locks later on, queue in front of
-                           first non-GROUP lock */
+                                /* Ok, we hit non-GROUP lock, there should
+                                 * be no more GROUP locks later on, queue in
+                                 * front of first non-GROUP lock */
  
                                  ldlm_resource_insert_lock_after(lock, req);
                                  list_del_init(&lock->l_res_link);
@@ -195,8 +411,7 @@ ldlm_extent_compat_queue(struct list_head *queue, struct ldlm_lock *req,
                          if (req->l_policy_data.l_extent.gid ==
                               lock->l_policy_data.l_extent.gid) {
                                  /* found it */
-                                ldlm_resource_insert_lock_after(lock,
-                                                                req);
+                                ldlm_resource_insert_lock_after(lock, req);
                                  RETURN(0);
                          }
                          continue;
@@ -204,20 +419,54 @@ ldlm_extent_compat_queue(struct list_head *queue, struct ldlm_lock *req,
  
                  /* locks are compatible, overlap doesn't matter */
                  if (lockmode_compat(lock->l_req_mode, req_mode)) {
+                        if (req_mode == LCK_PR &&
+                            ((lock->l_policy_data.l_extent.start <=
+                             req->l_policy_data.l_extent.start) &&
+                             (lock->l_policy_data.l_extent.end >=
+                              req->l_policy_data.l_extent.end))) {
+                                /* If we met a PR lock just like us or wider,
+                                   and nobody down the list conflicted with
+                                   it, that means we can skip processing of
+                                   the rest of the list and safely place
+                                   ourselves at the end of the list, or grant
+                                   (depending on whether we met any conflicting
+                                   locks before in the list).
+                                   In case of 1st enqueue only we continue
+                                   traversing if there is something conflicting
+                                   down the list because we need to make sure
+                                   that something is marked as AST_SENT as well,
+                                   in case of empty worklist we would exit on
+                                   first conflict met. */
+                                /* There IS a case where such flag is
+                                   not set for a lock, yet it blocks
+                                   something. Luckily for us this is
+                                   only during destroy, so lock is
+                                   exclusive. So here we are safe */
+                                if (!(lock->l_flags & LDLM_FL_AST_SENT)) {
+                                        RETURN(compat);
+                                }
+                        }
+
                          /* non-group locks are compatible, overlap doesn't
                             matter */
-                        if (req_mode != LCK_GROUP)
+                        if (likely(req_mode != LCK_GROUP))
                                  continue;
-                                
+
                          /* If we are trying to get a GROUP lock and there is
                             another one of this kind, we need to compare gid */
                          if (req->l_policy_data.l_extent.gid ==
                              lock->l_policy_data.l_extent.gid) {
+                                /* If existing lock with matched gid is granted,
+                                   we grant new one too. */
                                  if (lock->l_req_mode == lock->l_granted_mode)
                                          RETURN(2);
  
-                                /* If we are in nonblocking mode - return
-                                   immediately */
+                                /* Otherwise we are scanning queue of waiting
+                                 * locks and it means current request would
+                                 * block along with existing lock (that is
+                                 * already blocked).
+                                 * If we are in nonblocking mode - return
+                                 * immediately */
                                  if (*flags & LDLM_FL_BLOCK_NOWAIT) {
                                          compat = -EWOULDBLOCK;
                                          goto destroylock;
@@ -237,8 +486,8 @@ ldlm_extent_compat_queue(struct list_head *queue, struct ldlm_lock *req,
                          }
                  }
  
-                if (req_mode == LCK_GROUP &&
-                    (lock->l_req_mode != lock->l_granted_mode)) {
+                if (unlikely(req_mode == LCK_GROUP &&
+                    (lock->l_req_mode != lock->l_granted_mode))) {
                          scan = 1;
                          compat = 0;
                          if (lock->l_req_mode != LCK_GROUP) {
@@ -260,9 +509,9 @@ ldlm_extent_compat_queue(struct list_head *queue, struct ldlm_lock *req,
                          continue;
                  }
  
-                if (lock->l_req_mode == LCK_GROUP) {
-                        /* If compared lock is GROUP, then requested is PR/PW/=>
-                         * this is not compatible; extent range does not
+                if (unlikely(lock->l_req_mode == LCK_GROUP)) {
+                        /* If compared lock is GROUP, then requested is PR/PW/
+                         * so this is not compatible; extent range does not
                           * matter */
                          if (*flags & LDLM_FL_BLOCK_NOWAIT) {
                                  compat = -EWOULDBLOCK;
@@ -272,7 +521,7 @@ ldlm_extent_compat_queue(struct list_head *queue, struct ldlm_lock *req,
                          }
                  } else if (lock->l_policy_data.l_extent.end < req_start ||
                             lock->l_policy_data.l_extent.start > req_end) {
-                        /* if a non grouplock doesn't overlap skip it */
+                        /* if a non group lock doesn't overlap skip it */
                          continue;
                  }
  
@@ -284,32 +533,31 @@ ldlm_extent_compat_queue(struct list_head *queue, struct ldlm_lock *req,
                          ldlm_add_ast_work_item(lock, req, work_list);
          }
  
-        return(compat);
+        RETURN(compat);
  destroylock:
          list_del_init(&req->l_res_link);
-        ldlm_lock_destroy(req);
+        ldlm_lock_destroy_nolock(req);
          *err = compat;
          RETURN(compat);
  }
  
  /* If first_enq is 0 (ie, called from ldlm_reprocess_queue):
    *   - blocking ASTs have already been sent
-  *   - the caller has already initialized req->lr_tmp
    *   - must call this function with the ns lock held
    *
    * If first_enq is 1 (ie, called from ldlm_lock_enqueue):
    *   - blocking ASTs have not been sent
-  *   - the caller has NOT initialized req->lr_tmp, so we must
    *   - must call this function with the ns lock held once */
  int ldlm_process_extent_lock(struct ldlm_lock *lock, int *flags, int first_enq,
                               ldlm_error_t *err, struct list_head *work_list)
  {
          struct ldlm_resource *res = lock->l_resource;
-        struct list_head rpc_list = LIST_HEAD_INIT(rpc_list);
+        struct list_head rpc_list = CFS_LIST_HEAD_INIT(rpc_list);
          int rc, rc2;
          ENTRY;
  
          LASSERT(list_empty(&res->lr_converting));
+        check_res_locked(res);
          *err = ELDLM_OK;
  
          if (!first_enq) {
@@ -329,7 +577,9 @@ int ldlm_process_extent_lock(struct ldlm_lock *lock, int *flags, int first_enq,
                          RETURN(LDLM_ITER_STOP);
  
                  ldlm_resource_unlink_lock(lock);
-                ldlm_extent_policy(res, lock, flags);
+
+                if (!OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_EVICT_RACE))
+                        ldlm_extent_policy(res, lock, flags);
                  ldlm_grant_lock(lock, work_list);
                  RETURN(LDLM_ITER_CONTINUE);
          }
@@ -338,9 +588,8 @@ int ldlm_process_extent_lock(struct ldlm_lock *lock, int *flags, int first_enq,
          rc = ldlm_extent_compat_queue(&res->lr_granted, lock, flags, err, &rpc_list);
          if (rc < 0)
                  GOTO(out, rc); /* lock was destroyed */
-        if (rc == 2) {
+        if (rc == 2)
                  goto grant;
-        }
  
          rc2 = ldlm_extent_compat_queue(&res->lr_waiting, lock, flags, err, &rpc_list);
          if (rc2 < 0)
@@ -361,11 +610,31 @@ int ldlm_process_extent_lock(struct ldlm_lock *lock, int *flags, int first_enq,
                  if (list_empty(&lock->l_res_link))
                          ldlm_resource_add_lock(res, &res->lr_waiting, lock);
                  unlock_res(res);
-                rc = ldlm_run_bl_ast_work(&rpc_list);
+                rc = ldlm_run_ast_work(&rpc_list, LDLM_WORK_BL_AST);
                  lock_res(res);
-                if (rc == -ERESTART)
+
+                if (rc == -ERESTART) {
+                        /* lock was granted while resource was unlocked. */
+                        if (lock->l_granted_mode == lock->l_req_mode) {
+                                /* bug 11300: if the lock has been granted,
+                                 * break earlier because otherwise, we will go
+                                 * to restart and ldlm_resource_unlink will be
+                                 * called and it causes the interval node to be
+                                 * freed. Then we will fail at 
+                                 * ldlm_extent_add_lock() */
+                                *flags &= ~(LDLM_FL_BLOCK_GRANTED | LDLM_FL_BLOCK_CONV |
+                                            LDLM_FL_BLOCK_WAIT);
+                                GOTO(out, rc = 0);
+                        }
+
                          GOTO(restart, -ERESTART);
+                }
+
                  *flags |= LDLM_FL_BLOCK_GRANTED;
+                /* this way we force client to wait for the lock
+                 * endlessly once the lock is enqueued -bzzz */
+                *flags |= LDLM_FL_NO_TIMEOUT;
+
          }
          rc = 0;
  out:
@@ -374,7 +643,7 @@ out:
  
  /* When a lock is cancelled by a client, the KMS may undergo change if this
   * is the "highest lock".  This function returns the new KMS value.
- * Caller must hold ns_lock already. 
+ * Caller must hold ns_lock already.
   *
   * NB: A lock on [x,y] protects a KMS of up to y + 1 bytes! */
  __u64 ldlm_extent_shift_kms(struct ldlm_lock *lock, __u64 old_kms)
@@ -388,7 +657,6 @@ __u64 ldlm_extent_shift_kms(struct ldlm_lock *lock, __u64 old_kms)
          /* don't let another thread in ldlm_extent_shift_kms race in
           * just after we finish and take our lock into account in its
           * calculation of the kms */
-
          lock->l_flags |= LDLM_FL_KMS_IGNORE;
  
          list_for_each(tmp, &res->lr_granted) {
@@ -409,3 +677,124 @@ __u64 ldlm_extent_shift_kms(struct ldlm_lock *lock, __u64 old_kms)
  
          RETURN(kms);
  }
+
+cfs_mem_cache_t *ldlm_interval_slab;
+/* Allocate an interval node and attach lock to it; NULL if allocation fails. */
+struct ldlm_interval *ldlm_interval_alloc(struct ldlm_lock *lock)
+{
+        struct ldlm_interval *ival;
+        ENTRY;
+
+        LASSERT(lock->l_resource->lr_type == LDLM_EXTENT);
+        OBD_SLAB_ALLOC(ival, ldlm_interval_slab, CFS_ALLOC_IO, sizeof(*ival));
+        if (ival != NULL) {
+                CFS_INIT_LIST_HEAD(&ival->li_group);
+                ldlm_interval_attach(ival, lock);
+        }
+        RETURN(ival);
+}
+
+void ldlm_interval_free(struct ldlm_interval *node)
+{
+        if (node == NULL) /* freeing nothing is allowed */
+                return;
+        LASSERT(list_empty(&node->li_group)); /* no lock still attached */
+        OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node));
+}
+
+/* interval tree, for LDLM_EXTENT.
+ * Put lock l into the group of interval node n; l must not be attached yet. */
+void ldlm_interval_attach(struct ldlm_interval *n, struct ldlm_lock *l)
+{
+        LASSERT(l->l_tree_node == NULL);
+        LASSERT(l->l_resource->lr_type == LDLM_EXTENT);
+
+        l->l_tree_node = n;
+        list_add_tail(&l->l_sl_policy, &n->li_group);
+}
+
+/* Unhook l from its interval node.  Returns the node when no lock is left on
+ * it, NULL otherwise (including when l had no node at all). */
+struct ldlm_interval *ldlm_interval_detach(struct ldlm_lock *l)
+{
+        struct ldlm_interval *ival = l->l_tree_node;
+
+        if (ival == NULL)
+                return NULL;
+        LASSERT(!list_empty(&ival->li_group));
+        list_del_init(&l->l_sl_policy);
+        l->l_tree_node = NULL;
+        return list_empty(&ival->li_group) ? ival : NULL;
+}
+
+static inline int lock_mode_to_index(ldlm_mode_t mode)
+{
+        int index = -1; /* ends up as the position of mode's single set bit */
+        LASSERT(mode != 0);
+        LASSERT(IS_PO2(mode));
+        for (; mode != 0; mode >>= 1)
+                index++;
+        LASSERT(index < LCK_MODE_NUM);
+        return index;
+}
+
+/* Insert a granted extent lock into the interval tree of its mode and into
+ * the granted list of the resource. */
+void ldlm_extent_add_lock(struct ldlm_resource *res, struct ldlm_lock *lock)
+{
+        struct ldlm_extent *extent = &lock->l_policy_data.l_extent;
+        struct ldlm_interval *node = lock->l_tree_node;
+        struct ldlm_interval_tree *tree;
+        struct interval_node *found;
+        int idx;
+
+        LASSERT(lock->l_granted_mode == lock->l_req_mode);
+        LASSERT(node != NULL);
+
+        idx = lock_mode_to_index(lock->l_granted_mode);
+        LASSERT(lock->l_granted_mode == 1 << idx);
+        tree = &res->lr_itree[idx];
+        LASSERT(lock->l_granted_mode == tree->lit_mode);
+
+        /* node extent initialize */
+        interval_set(&node->li_node, extent->start, extent->end);
+
+        found = interval_insert(&node->li_node, &tree->lit_root);
+        if (found != NULL) {
+                /* The policy group found: free our own node and join it. */
+                struct ldlm_interval *own = ldlm_interval_detach(lock);
+                LASSERT(own != NULL);
+                ldlm_interval_free(own);
+                ldlm_interval_attach(to_ldlm_interval(found), lock);
+        }
+        tree->lit_size++;
+
+        /* even though we use interval tree to manage the extent lock, we also
+         * add the locks into grant list, for debug purpose, .. */
+        ldlm_resource_add_lock(res, &res->lr_granted, lock);
+}
+
+/* Drop a granted lock from its mode's interval tree; ungranted: no-op. */
+void ldlm_extent_unlink_lock(struct ldlm_lock *lock)
+{
+        struct ldlm_resource *res = lock->l_resource;
+        struct ldlm_interval_tree *tree;
+        struct ldlm_interval *emptied;
+        int idx;
+
+        if (lock->l_granted_mode != lock->l_req_mode)
+                return;
+
+        LASSERT(lock->l_tree_node != NULL);
+        idx = lock_mode_to_index(lock->l_granted_mode);
+        LASSERT(lock->l_granted_mode == 1 << idx);
+        tree = &res->lr_itree[idx];
+        LASSERT(tree->lit_root != NULL); /* assure the tree is not null */
+
+        tree->lit_size--;
+        emptied = ldlm_interval_detach(lock);
+        if (emptied == NULL) /* other locks still share the node */
+                return;
+        interval_erase(&emptied->li_node, &tree->lit_root);
+        ldlm_interval_free(emptied);
+}