b=11300
author    jxiong <jxiong>
Thu, 6 Mar 2008 04:20:32 +0000 (04:20 +0000)
committer jxiong <jxiong>
Thu, 6 Mar 2008 04:20:32 +0000 (04:20 +0000)
r=adilger,oleg,nikita,vitaly

Use an interval tree to manage the extent locks, accelerating the eviction of contended locks.
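
In outline (a hedged sketch, not part of the patch): each resource now keeps one
interval tree per lock mode, so checking a request against the granted locks
becomes a tree descent instead of a walk of the whole granted list. Simplified
from ldlm_extent_compat_queue() below; lockmode_compat() and the types come from
the Lustre headers touched by this patch:

    int conflicts_with_granted(struct ldlm_resource *res,
                               ldlm_mode_t req_mode,
                               struct interval_node_extent *ex)
    {
            int idx;

            for (idx = 0; idx < LCK_MODE_NUM; idx++) {
                    struct ldlm_interval_tree *tree = &res->lr_itree[idx];

                    if (tree->lit_root == NULL ||          /* empty tree */
                        lockmode_compat(tree->lit_mode, req_mode))
                            continue;
                    if (interval_is_overlapped(tree->lit_root, ex))
                            return 1;
            }
            return 0;
    }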

20 files changed:
lustre/include/Makefile.am
lustre/include/interval_tree.h [new file with mode: 0644]
lustre/include/lustre/lustre_idl.h
lustre/include/lustre_dlm.h
lustre/include/obd_support.h
lustre/ldlm/Makefile.am
lustre/ldlm/interval_tree.c [new file with mode: 0644]
lustre/ldlm/ldlm_extent.c
lustre/ldlm/ldlm_internal.h
lustre/ldlm/ldlm_lock.c
lustre/ldlm/ldlm_lockd.c
lustre/ldlm/ldlm_resource.c
lustre/obdfilter/filter.c
lustre/ptlrpc/Makefile.in
lustre/ptlrpc/autoMakefile.am
lustre/ptlrpc/wiretest.c
lustre/tests/Makefile.am
lustre/tests/it_test.c [new file with mode: 0644]
lustre/tests/sanityN.sh
lustre/utils/wiretest.c

index 27e62fc..7c99903 100644 (file)
@@ -16,4 +16,4 @@ EXTRA_DIST = ioctl.h liblustre.h lprocfs_status.h lustre_cfg.h        \
             obd_ost.h obd_support.h lustre_ver.h lu_object.h lu_time.h  \
              md_object.h dt_object.h lustre_param.h lustre_mdt.h \
              lustre_fid.h lustre_fld.h lustre_req_layout.h lustre_capa.h \
-             lustre_idmap.h lustre_eacl.h
+             lustre_idmap.h lustre_eacl.h interval_tree.h
diff --git a/lustre/include/interval_tree.h b/lustre/include/interval_tree.h
new file mode 100644 (file)
index 0000000..41436c1
--- /dev/null
@@ -0,0 +1,106 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * (visit-tags-table FILE)
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002, 2007 Cluster File Systems, Inc.
+ *   Author: Huang Wei <huangwei@clusterfs.com>
+ *   Author: Jay Xiong <jinshan.xiong@sun.com>
+ *
+ *   This file is part of the Lustre file system, http://www.lustre.org
+ *   Lustre is a trademark of Cluster File Systems, Inc.
+ *
+ *   You may have signed or agreed to another license before downloading
+ *   this software.  If so, you are bound by the terms and conditions
+ *   of that agreement, and the following does not apply to you.  See the
+ *   LICENSE file included with this distribution for more information.
+ *
+ *   If you did not agree to a different license, then this copy of Lustre
+ *   is open source software; you can redistribute it and/or modify it
+ *   under the terms of version 2 of the GNU General Public License as
+ *   published by the Free Software Foundation.
+ *
+ *   In either case, Lustre is distributed in the hope that it will be
+ *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   license text for more details.
+ */
+
+#ifndef _INTERVAL_H__
+#define _INTERVAL_H__
+
+#include <libcfs/types.h>  /* __u8, __u64 etc. */
+#include <libcfs/kp30.h>   /* LASSERT. */
+
+struct interval_node {
+        struct interval_node   *in_left;
+        struct interval_node   *in_right;
+        struct interval_node   *in_parent;
+        __u8                    in_color;
+        __u8                    res1[7];  /* tags, 8-byte aligned */
+        __u64                   in_max_high;
+        struct interval_node_extent {
+                __u64 start;
+                __u64 end;
+        } in_extent;
+};
+
+enum interval_iter {
+        INTERVAL_ITER_CONT = 1,
+        INTERVAL_ITER_STOP = 2
+};
+
+static inline __u64 interval_low(struct interval_node *node)
+{
+        return node->in_extent.start;
+}
+
+static inline __u64 interval_high(struct interval_node *node)
+{
+        return node->in_extent.end;
+}
+
+static inline void interval_set(struct interval_node *node,
+                                __u64 start, __u64 end)
+{
+        LASSERT(start <= end);
+        node->in_extent.start = start;
+        node->in_extent.end = end;
+        node->in_max_high = end;
+}
+
+/* Rules for writing an interval callback:
+ *  - the callback returns INTERVAL_ITER_STOP when it decides the iteration
+ *    should stop; the iteration function then returns immediately with
+ *    return value INTERVAL_ITER_STOP.
+ *  - callbacks for interval_iterate and interval_iterate_reverse: every
+ *    node in the tree is passed as @node when the callback is called.
+ *  - callback for interval_search: only overlapping nodes are passed as
+ *    @node when the callback is called.
+ */
+typedef enum interval_iter (*interval_callback_t)(struct interval_node *node,
+                                                  void *args);
+
+struct interval_node *interval_insert(struct interval_node *node,
+                                      struct interval_node **root);
+void interval_erase(struct interval_node *node, struct interval_node **root);
+
+/* Search the extents in the tree and call @func for each overlapping
+ * extent. */
+enum interval_iter interval_search(struct interval_node *root,
+                                   struct interval_node_extent *ex,
+                                   interval_callback_t func, void *data);
+
+/* Iterate every node in the tree - by reverse order or regular order. */
+enum interval_iter interval_iterate(struct interval_node *root,
+                                    interval_callback_t func, void *data);
+enum interval_iter interval_iterate_reverse(struct interval_node *root,
+                                            interval_callback_t func,
+                                            void *data);
+
+void interval_expand(struct interval_node *root, 
+                     struct interval_node_extent *ext,
+                     struct interval_node_extent *limiter);
+int interval_is_overlapped(struct interval_node *root, 
+                           struct interval_node_extent *ex);
+struct interval_node *interval_find(struct interval_node *root,
+                                    struct interval_node_extent *ex);
+#endif
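
A minimal usage sketch of the API above (illustrative only, not part of the
patch; it assumes the libcfs environment that supplies __u64 and LASSERT):

    static enum interval_iter count_cb(struct interval_node *node, void *args)
    {
            (*(int *)args)++;               /* count each overlapping node */
            return INTERVAL_ITER_CONT;      /* keep searching */
    }

    static int count_overlaps(void)
    {
            struct interval_node a, b, *root = NULL;
            struct interval_node_extent query = { .start = 15, .end = 25 };
            int hits = 0;

            interval_set(&a, 0, 10);        /* extent [0, 10] */
            interval_set(&b, 20, 30);       /* extent [20, 30] */
            interval_insert(&a, &root);     /* returns NULL: no duplicate */
            interval_insert(&b, &root);

            /* count_cb runs once for each node overlapping [15, 25];
             * here only [20, 30] qualifies, so hits becomes 1 */
            interval_search(root, &query, count_cb, &hits);
            return hits;
    }
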
index be6b8d1..2a8786f 100644 (file)
@@ -1652,6 +1652,8 @@ typedef enum {
         LCK_MAXMODE
 } ldlm_mode_t;
 
+#define LCK_MODE_NUM    7
+
 typedef enum {
         LDLM_PLAIN     = 10,
         LDLM_EXTENT    = 11,
@@ -1668,6 +1670,12 @@ struct ldlm_extent {
         __u64 gid;
 };
 
+static inline int ldlm_extent_overlap(struct ldlm_extent *ex1,
+                                      struct ldlm_extent *ex2)
+{
+        return (ex1->start <= ex2->end) && (ex2->start <= ex1->end);
+}
+
 struct ldlm_inodebits {
         __u64 bits;
 };
index 7789517..48722d5 100644 (file)
@@ -21,6 +21,7 @@
 #include <lustre_import.h>
 #include <lustre_handles.h>
 #include <lustre_export.h> /* for obd_export, for LDLM_DEBUG */
+#include <interval_tree.h> /* for interval_node{}, ldlm_extent */
 
 struct obd_ops;
 struct obd_device;
@@ -365,6 +366,23 @@ typedef int (*ldlm_completion_callback)(struct ldlm_lock *lock, int flags,
                                         void *data);
 typedef int (*ldlm_glimpse_callback)(struct ldlm_lock *lock, void *data);
 
+/* Interval node data for each LDLM_EXTENT lock */
+struct ldlm_interval {
+        struct interval_node li_node;   /* node for tree mgmt */
+        struct list_head     li_group;  /* the locks that share the same
+                                         * extent policy (a policy group) */
+};
+#define to_ldlm_interval(n) container_of(n, struct ldlm_interval, li_node)
+
+/* The interval tree must be accessed while holding the resource lock. */
+struct ldlm_interval_tree {
+        /* tree size; this variable is used to count
+         * granted PW locks in ldlm_extent_policy() */
+        int                   lit_size;
+        ldlm_mode_t           lit_mode; /* lock mode */
+        struct interval_node *lit_root; /* actually ldlm_interval */
+};
+
 struct ldlm_lock {
         struct portals_handle l_handle; // must be first in the structure
         atomic_t              l_refc;
@@ -385,6 +403,8 @@ struct ldlm_lock {
         struct list_head      l_sl_mode;        // skip pointer for request mode
         struct list_head      l_sl_policy;      // skip pointer for inodebits
 
+        struct ldlm_interval *l_tree_node;      /* tree node for ldlm_extent */
+
         /* protected by led_lock */
         struct list_head      l_export_chain; // per-export chain of locks
 
@@ -458,6 +478,8 @@ struct ldlm_resource {
         struct ldlm_res_id     lr_name;
         atomic_t               lr_refcount;
 
+        struct ldlm_interval_tree lr_itree[LCK_MODE_NUM];  /* interval trees*/
+
         /* Server-side-only lock value block elements */
         struct semaphore       lr_lvb_sem;
         __u32                  lr_lvb_len;
index d029d99..8f0143f 100644 (file)
@@ -197,6 +197,11 @@ int __obd_fail_check_set(__u32 id, __u32 value, int set);
 #define OBD_FAIL_LDLM_GLIMPSE            0x30f
 #define OBD_FAIL_LDLM_CANCEL_RACE        0x310
 #define OBD_FAIL_LDLM_CANCEL_EVICT_RACE  0x311
+/* 
+#define OBD_FAIL_LDLM_PAUSE_CANCEL       0x312
+#define OBD_FAIL_LDLM_CLOSE_THREAD       0x313
+*/
+#define OBD_FAIL_LDLM_CANCEL_BL_CB_RACE  0x314
 
 #define OBD_FAIL_OSC                     0x400
 #define OBD_FAIL_OSC_BRW_READ_BULK       0x401
index 7beda3d..b5993fc 100644 (file)
@@ -10,4 +10,5 @@
 MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ 
 DIST_SOURCES = ldlm_extent.c ldlm_flock.c ldlm_internal.h ldlm_lib.c \
        ldlm_lock.c ldlm_lockd.c ldlm_plain.c ldlm_request.c         \
-       ldlm_resource.c l_lock.c ldlm_inodebits.c ldlm_pool.c
+       ldlm_resource.c l_lock.c ldlm_inodebits.c ldlm_pool.c        \
+       interval_tree.c
diff --git a/lustre/ldlm/interval_tree.c b/lustre/ldlm/interval_tree.c
new file mode 100644 (file)
index 0000000..bedf5b3
--- /dev/null
@@ -0,0 +1,752 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Interval tree library used by ldlm extent lock code
+ *
+ *  Copyright (c) 2007 Cluster File Systems, Inc.
+ *   Author: Huang Wei <huangwei@clusterfs.com>
+ *   Author: Jay Xiong <jinshan.xiong@sun.com>
+ *
+ *   This file is part of the Lustre file system, http://www.lustre.org
+ *   Lustre is a trademark of Cluster File Systems, Inc.
+ *
+ *   You may have signed or agreed to another license before downloading
+ *   this software.  If so, you are bound by the terms and conditions
+ *   of that agreement, and the following does not apply to you.  See the
+ *   LICENSE file included with this distribution for more information.
+ *
+ *   If you did not agree to a different license, then this copy of Lustre
+ *   is open source software; you can redistribute it and/or modify it
+ *   under the terms of version 2 of the GNU General Public License as
+ *   published by the Free Software Foundation.
+ *
+ *   In either case, Lustre is distributed in the hope that it will be
+ *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   license text for more details.
+ */
+#ifdef __KERNEL__
+# include <lustre_dlm.h>
+#else
+# include <liblustre.h>
+# include <libcfs/kp30.h>
+#endif
+#include <obd_support.h>
+#include <interval_tree.h>
+
+enum {
+        INTERVAL_RED = 0,
+        INTERVAL_BLACK = 1
+};
+
+static inline int node_is_left_child(struct interval_node *node)
+{
+        LASSERT(node->in_parent != NULL);
+        return node == node->in_parent->in_left;
+}
+
+static inline int node_is_right_child(struct interval_node *node)
+{
+        LASSERT(node->in_parent != NULL);
+        return node == node->in_parent->in_right;
+}
+
+static inline int node_is_red(struct interval_node *node)
+{
+        return node->in_color == INTERVAL_RED;
+}
+
+static inline int node_is_black(struct interval_node *node)
+{
+        return node->in_color == INTERVAL_BLACK;
+}
+
+static inline int extent_compare(struct interval_node_extent *e1,
+                                 struct interval_node_extent *e2)
+{
+        int rc;
+        if (e1->start == e2->start) {
+                if (e1->end < e2->end)
+                        rc = -1;
+                else if (e1->end > e2->end)
+                        rc = 1;
+                else
+                        rc = 0;
+        } else {
+                if (e1->start < e2->start)
+                        rc = -1;
+                else
+                        rc = 1;
+        }
+        return rc;
+}
+
+static inline int extent_equal(struct interval_node_extent *e1,
+                               struct interval_node_extent *e2)
+{
+        return (e1->start == e2->start) && (e1->end == e2->end);
+}
+
+static inline int extent_overlapped(struct interval_node_extent *e1, 
+                                    struct interval_node_extent *e2)
+{
+        return (e1->start <= e2->end) && (e2->start <= e1->end);
+}
+
+static inline int node_compare(struct interval_node *n1,
+                               struct interval_node *n2)
+{
+        return extent_compare(&n1->in_extent, &n2->in_extent);
+}
+
+static inline int node_equal(struct interval_node *n1,
+                             struct interval_node *n2)
+{
+        return extent_equal(&n1->in_extent, &n2->in_extent);
+}
+
+static inline __u64 max_u64(__u64 x, __u64 y)
+{
+        return x > y ? x : y;
+}
+
+static inline __u64 min_u64(__u64 x, __u64 y)
+{
+        return x < y ? x : y;
+}
+
+#define interval_for_each(node, root)                   \
+for (node = interval_first(root); node != NULL;         \
+     node = interval_next(node))
+
+#define interval_for_each_reverse(node, root)           \
+for (node = interval_last(root); node != NULL;          \
+     node = interval_prev(node))
+
+static struct interval_node *interval_first(struct interval_node *node)
+{
+        ENTRY;
+
+        if (!node)
+                RETURN(NULL);
+        while (node->in_left)
+                node = node->in_left;
+        RETURN(node);
+}
+
+static struct interval_node *interval_last(struct interval_node *node)
+{
+        ENTRY;
+
+        if (!node)
+                RETURN(NULL);
+        while (node->in_right)
+                node = node->in_right;
+        RETURN(node);
+}
+
+static struct interval_node *interval_next(struct interval_node *node)
+{
+        ENTRY;
+
+        if (!node)
+                RETURN(NULL);
+        if (node->in_right)
+                RETURN(interval_first(node->in_right));
+        while (node->in_parent && node_is_right_child(node))
+                node = node->in_parent;
+        RETURN(node->in_parent);
+}
+
+static struct interval_node *interval_prev(struct interval_node *node)
+{
+        ENTRY;
+
+        if (!node)
+                RETURN(NULL);
+
+        if (node->in_left)
+                RETURN(interval_last(node->in_left));
+
+        while (node->in_parent && node_is_left_child(node))
+                node = node->in_parent;
+
+        RETURN(node->in_parent);
+}
+
+enum interval_iter interval_iterate(struct interval_node *root,
+                                    interval_callback_t func,
+                                    void *data)
+{
+        struct interval_node *node;
+        enum interval_iter rc = INTERVAL_ITER_CONT;
+        ENTRY;
+        
+        interval_for_each(node, root) {
+                rc = func(node, data);
+                if (rc == INTERVAL_ITER_STOP)
+                        break;
+        }
+
+        RETURN(rc);
+}
+EXPORT_SYMBOL(interval_iterate);
+
+enum interval_iter interval_iterate_reverse(struct interval_node *root,
+                                            interval_callback_t func,
+                                            void *data)
+{
+        struct interval_node *node;
+        enum interval_iter rc = INTERVAL_ITER_CONT;
+        ENTRY;
+        
+        interval_for_each_reverse(node, root) {
+                rc = func(node, data);
+                if (rc == INTERVAL_ITER_STOP)
+                        break;
+        }
+
+        RETURN(rc);
+}
+EXPORT_SYMBOL(interval_iterate_reverse);
+
+/* Try to find a node with the same interval in the tree; if found,
+ * return a pointer to that node, otherwise return NULL. */
+struct interval_node *interval_find(struct interval_node *root,
+                                    struct interval_node_extent *ex)
+{
+        struct interval_node *walk = root;
+        int rc;
+        ENTRY;
+
+        while (walk) {
+                rc = extent_compare(ex, &walk->in_extent);
+                if (rc == 0)
+                        break;
+                else if (rc < 0)
+                        walk = walk->in_left;
+                else
+                        walk = walk->in_right;
+        }
+
+        RETURN(walk);
+}
+EXPORT_SYMBOL(interval_find);
+
+static void __rotate_change_maxhigh(struct interval_node *node,
+                                    struct interval_node *rotate)
+{
+        __u64 left_max, right_max;
+
+        rotate->in_max_high = node->in_max_high;
+        left_max = node->in_left ? node->in_left->in_max_high : 0;
+        right_max = node->in_right ? node->in_right->in_max_high : 0;
+        node->in_max_high  = max_u64(interval_high(node),
+                                     max_u64(left_max,right_max));
+}
+
+/* The left rotation "pivots" around the link from node to node->right, and
+ * - node will be linked to node->right's left child, and
+ * - node->right's left child will be linked to node's right child.  */
+static void __rotate_left(struct interval_node *node,
+                          struct interval_node **root)
+{
+        struct interval_node *right = node->in_right;
+        struct interval_node *parent = node->in_parent;
+
+        node->in_right = right->in_left;
+        if (node->in_right)
+                right->in_left->in_parent = node;
+
+        right->in_left = node;
+        right->in_parent = parent;
+        if (parent) {
+                if (node_is_left_child(node))
+                        parent->in_left = right;
+                else
+                        parent->in_right = right;
+        } else {
+                *root = right;
+        }
+        node->in_parent = right;
+
+        /* update max_high for node and right */
+        __rotate_change_maxhigh(node, right);
+}
+
+/* The right rotation "pivots" around the link from node to node->left, and
+ * - node will be linked to node->left's right child, and
+ * - node->left's right child will be linked to node's left child.  */
+static void __rotate_right(struct interval_node *node,
+                           struct interval_node **root)
+{
+        struct interval_node *left = node->in_left;
+        struct interval_node *parent = node->in_parent;
+
+        node->in_left = left->in_right;
+        if (node->in_left)
+                left->in_right->in_parent = node;
+        left->in_right = node;
+
+        left->in_parent = parent;
+        if (parent) {
+                if (node_is_right_child(node))
+                        parent->in_right = left;
+                else
+                        parent->in_left = left;
+        } else {
+                *root = left;
+        }
+        node->in_parent = left;
+
+        /* update max_high for node and left */
+        __rotate_change_maxhigh(node, left);
+}
+
+#define interval_swap(a, b) do {                        \
+        struct interval_node *c = a; a = b; b = c;      \
+} while (0)
+
+/*
+ * Operations INSERT and DELETE, when run on a tree with n keys,
+ * take O(log n) time. Because they modify the tree, the result
+ * may violate the red-black properties. To restore these properties,
+ * we must change the colors of some of the nodes in the tree
+ * and also change the pointer structure.
+ */
+static void interval_insert_color(struct interval_node *node,
+                                  struct interval_node **root)
+{
+        struct interval_node *parent, *gparent;
+        ENTRY;
+
+        while ((parent = node->in_parent) && node_is_red(parent)) {
+                gparent = parent->in_parent;
+                /* Parent is RED, so gparent must not be NULL */
+                if (node_is_left_child(parent)) {
+                        struct interval_node *uncle;
+                        uncle = gparent->in_right;
+                        if (uncle && node_is_red(uncle)) {
+                                uncle->in_color = INTERVAL_BLACK;
+                                parent->in_color = INTERVAL_BLACK;
+                                gparent->in_color = INTERVAL_RED;
+                                node = gparent;
+                                continue;
+                        }
+
+                        if (parent->in_right == node) {
+                                __rotate_left(parent, root);
+                                interval_swap(node, parent);
+                        }
+
+                        parent->in_color = INTERVAL_BLACK;
+                        gparent->in_color = INTERVAL_RED;
+                        __rotate_right(gparent, root);
+                } else {
+                        struct interval_node *uncle;
+                        uncle = gparent->in_left;
+                        if (uncle && node_is_red(uncle)) {
+                                uncle->in_color = INTERVAL_BLACK;
+                                parent->in_color = INTERVAL_BLACK;
+                                gparent->in_color = INTERVAL_RED;
+                                node = gparent;
+                                continue;
+                        }
+
+                        if (node_is_left_child(node)) {
+                                __rotate_right(parent, root);
+                                interval_swap(node, parent);
+                        }
+
+                        parent->in_color = INTERVAL_BLACK;
+                        gparent->in_color = INTERVAL_RED;
+                        __rotate_left(gparent, root);
+                }
+        }
+
+        (*root)->in_color = INTERVAL_BLACK;
+        EXIT;
+}
+
+struct interval_node *interval_insert(struct interval_node *node,
+                                      struct interval_node **root)
+{
+        struct interval_node **p, *parent = NULL;
+        ENTRY;
+
+        p = root;
+        while (*p) {
+                parent = *p;
+                if (node_equal(parent, node))
+                        RETURN(parent);
+
+                /* max_high field must be updated after each iteration */
+                if (parent->in_max_high < interval_high(node))
+                        parent->in_max_high = interval_high(node);
+
+                if (node_compare(node, parent) < 0)
+                        p = &parent->in_left;
+                else 
+                        p = &parent->in_right;
+        }
+
+        /* link node into the tree */
+        node->in_parent = parent;
+        node->in_color = INTERVAL_RED;
+        node->in_left = node->in_right = NULL;
+        *p = node;
+
+        interval_insert_color(node, root);
+
+        RETURN(NULL);
+}
+EXPORT_SYMBOL(interval_insert);
+
+static inline int node_is_black_or_0(struct interval_node *node)
+{
+        return !node || node_is_black(node);
+}
+
+static void interval_erase_color(struct interval_node *node,
+                                 struct interval_node *parent,
+                                 struct interval_node **root)
+{
+        struct interval_node *tmp;
+        ENTRY;
+
+        while (node_is_black_or_0(node) && node != *root) {
+                if (parent->in_left == node) {
+                        tmp = parent->in_right;
+                        if (node_is_red(tmp)) {
+                                tmp->in_color = INTERVAL_BLACK;
+                                parent->in_color = INTERVAL_RED;
+                                __rotate_left(parent, root);
+                                tmp = parent->in_right;
+                        }
+                        if (node_is_black_or_0(tmp->in_left) &&
+                            node_is_black_or_0(tmp->in_right)) {
+                                tmp->in_color = INTERVAL_RED;
+                                node = parent;
+                                parent = node->in_parent;
+                        } else {
+                                if (node_is_black_or_0(tmp->in_right)) {
+                                        struct interval_node *o_left;
+                                        if ((o_left = tmp->in_left))
+                                             o_left->in_color = INTERVAL_BLACK;
+                                        tmp->in_color = INTERVAL_RED;
+                                        __rotate_right(tmp, root);
+                                        tmp = parent->in_right;
+                                }
+                                tmp->in_color = parent->in_color;
+                                parent->in_color = INTERVAL_BLACK;
+                                if (tmp->in_right)
+                                    tmp->in_right->in_color = INTERVAL_BLACK;
+                                __rotate_left(parent, root);
+                                node = *root;
+                                break;
+                        }
+                } else {
+                        tmp = parent->in_left;
+                        if (node_is_red(tmp)) {
+                                tmp->in_color = INTERVAL_BLACK;
+                                parent->in_color = INTERVAL_RED;
+                                __rotate_right(parent, root);
+                                tmp = parent->in_left;
+                        }
+                        if (node_is_black_or_0(tmp->in_left) &&
+                            node_is_black_or_0(tmp->in_right)) {
+                                tmp->in_color = INTERVAL_RED;
+                                node = parent;
+                                parent = node->in_parent;
+                        } else {
+                                if (node_is_black_or_0(tmp->in_left)) {
+                                        struct interval_node *o_right;
+                                        if ((o_right = tmp->in_right))
+                                            o_right->in_color = INTERVAL_BLACK;
+                                        tmp->in_color = INTERVAL_RED;
+                                        __rotate_left(tmp, root);
+                                        tmp = parent->in_left;
+                                }
+                                tmp->in_color = parent->in_color;
+                                parent->in_color = INTERVAL_BLACK;
+                                if (tmp->in_left)
+                                        tmp->in_left->in_color = INTERVAL_BLACK;
+                                __rotate_right(parent, root);
+                                node = *root;
+                                break;
+                        }
+                }
+        }
+        if (node)
+                node->in_color = INTERVAL_BLACK;
+        EXIT;
+}
+
+/*
+ * If the @max_high value of @node has changed, this function traverses a
+ * path from @node up to the root to update max_high for the whole tree.
+ */
+static void update_maxhigh(struct interval_node *node,
+                           __u64  old_maxhigh)
+{
+        __u64 left_max, right_max;
+        ENTRY;
+
+        while (node) {
+                left_max = node->in_left ? node->in_left->in_max_high : 0;
+                right_max = node->in_right ? node->in_right->in_max_high : 0;
+                node->in_max_high = max_u64(interval_high(node),
+                                            max_u64(left_max, right_max));
+
+                if (node->in_max_high >= old_maxhigh)
+                        break;
+                node = node->in_parent;
+        }
+        EXIT;
+}
+
+void interval_erase(struct interval_node *node,
+                    struct interval_node **root)
+{
+        struct interval_node *child, *parent;
+        int color;
+        ENTRY;
+
+        if (!node->in_left) {
+                child = node->in_right;
+        } else if (!node->in_right) {
+                child = node->in_left;
+        } else { /* Both left and right child are not NULL */
+                struct interval_node *old = node;
+
+                node = interval_next(node);
+                child = node->in_right;
+                parent = node->in_parent;
+                color = node->in_color;
+
+                if (child)
+                        child->in_parent = parent;
+                if (parent == old) {
+                        parent->in_right = child;
+                        parent = node;
+                } else {
+                        parent->in_left = child;
+                }
+
+                node->in_color = old->in_color;
+                node->in_right = old->in_right;
+                node->in_left = old->in_left;
+                node->in_parent = old->in_parent;
+
+                if (old->in_parent) {
+                        if (node_is_left_child(old))
+                                old->in_parent->in_left = node;
+                        else
+                                old->in_parent->in_right = node;
+                } else {
+                        *root = node;
+                }
+
+                old->in_left->in_parent = node;
+                if (old->in_right)
+                        old->in_right->in_parent = node;
+                update_maxhigh(child, node->in_max_high);
+                update_maxhigh(node, old->in_max_high);
+                goto color;
+        }
+        parent = node->in_parent;
+        color = node->in_color;
+
+        if (child)
+                child->in_parent = parent;
+        if (parent) {
+                if (node_is_left_child(node))
+                        parent->in_left = child;
+                else
+                        parent->in_right = child;
+        } else {
+                *root = child;
+        }
+
+        update_maxhigh(child, node->in_max_high);
+
+color:
+        if (color == INTERVAL_BLACK)
+                interval_erase_color(child, parent, root);
+        EXIT;
+}
+EXPORT_SYMBOL(interval_erase);
+
+static inline int interval_may_overlap(struct interval_node *node,
+                                          struct interval_node_extent *ext)
+{
+        return (ext->start <= node->in_max_high &&
+                ext->end >= interval_low(node));
+}
+
+/*
+ * This function finds all intervals that overlap interval ext
+ * and calls @func on each of them. In Lustre, it is used to find
+ * all conflicting locks in the granted queue and add them to the
+ * AST work list.
+ *
+ * The recursive equivalent:
+ *
+ * {
+ *       if (node == NULL)
+ *               return 0;
+ *       if (ext->end < interval_low(node)) {
+ *               interval_search(node->in_left, ext, func, data);
+ *       } else if (interval_may_overlap(node, ext)) {
+ *               if (extent_overlapped(ext, &node->in_extent))
+ *                       func(node, data);
+ *               interval_search(node->in_left, ext, func, data);
+ *               interval_search(node->in_right, ext, func, data);
+ *       }
+ *       return 0;
+ * }
+ *
+ */
+enum interval_iter interval_search(struct interval_node *node,
+                                   struct interval_node_extent *ext,
+                                   interval_callback_t func,
+                                   void *data)
+{
+        struct interval_node *parent;
+        enum interval_iter rc = INTERVAL_ITER_CONT;
+
+        LASSERT(ext != NULL);
+        LASSERT(func != NULL);
+
+        while (node) {
+                if (ext->end < interval_low(node)) {
+                        if (node->in_left) {
+                                node = node->in_left;
+                                continue;
+                        }
+                } else if (interval_may_overlap(node, ext)) {
+                        if (extent_overlapped(ext, &node->in_extent)) {
+                                rc = func(node, data);
+                                if (rc == INTERVAL_ITER_STOP)
+                                        break;
+                        }
+
+                        if (node->in_left) {
+                                node = node->in_left;
+                                continue;
+                        }
+                        if (node->in_right) {
+                                node = node->in_right;
+                                continue;
+                        }
+                } 
+
+                parent = node->in_parent;
+                while (parent) {
+                        if (node_is_left_child(node) &&
+                            parent->in_right) {
+                                /* If we ever went left, the parent satisfied
+                                 * ext->end < interval_low(parent) or
+                                 * may_overlap(parent). In the former case we
+                                 * needn't climb further, so stop early and
+                                 * check may_overlap(parent) after this loop. */
+                                node = parent->in_right;
+                                break;
+                        }
+                        node = parent;
+                        parent = parent->in_parent;
+                }
+                if (parent == NULL || !interval_may_overlap(parent, ext))
+                        break;
+        }
+
+        return rc;
+}
+EXPORT_SYMBOL(interval_search);
+
+static enum interval_iter interval_overlap_cb(struct interval_node *n,
+                                              void *args)
+{
+        *(int *)args = 1;
+        return INTERVAL_ITER_STOP;
+}
+
+int interval_is_overlapped(struct interval_node *root,
+                           struct interval_node_extent *ext)
+{
+        int has = 0;
+        (void)interval_search(root, ext, interval_overlap_cb, &has);
+        return has;
+}
+EXPORT_SYMBOL(interval_is_overlapped);
+
+/* Don't expand the low end. Expanding downwards is expensive and of little
+ * use for most extents, because programs seldom do IO backwards.
+ *
+ * The recursive algorithm of expanding low:
+ * expand_low {
+ *        struct interval_node *tmp;
+ *        static __u64 res = 0;
+ *
+ *        if (root == NULL)
+ *                return res;
+ *        if (root->in_max_high < low) {
+ *                res = max_u64(root->in_max_high + 1, res);
+ *                return res;
+ *        } else if (low < interval_low(root)) {
+ *                interval_expand_low(root->in_left, low);
+ *                return res;
+ *        }
+ *
+ *        if (interval_high(root) < low)
+ *                res = max_u64(interval_high(root) + 1, res);
+ *        interval_expand_low(root->in_left, low);
+ *        interval_expand_low(root->in_right, low);
+ *
+ *        return res;
+ * }
+ *
+ * It's easy to eliminate the recursion; see interval_search for
+ * an example. -jay
+ */
+static inline __u64 interval_expand_low(struct interval_node *root, __u64 low)
+{
+        /* we only handle the empty tree for now. */
+        if (root == NULL)
+                return 0;
+        return low;
+}
+
+static inline __u64 interval_expand_high(struct interval_node *node, __u64 high)
+{
+        __u64 result = ~0;
+
+        while (node != NULL) {
+                if (node->in_max_high < high)
+                        break;
+                        
+                if (interval_low(node) > high) {
+                        result = interval_low(node) - 1;
+                        node = node->in_left;
+                } else {
+                        node = node->in_right;
+                }
+        }
+
+        return result;
+}
+
+/* Expand @ext as far as possible without overlapping any extent in the
+ * tree, bounded by @limiter. */
+void interval_expand(struct interval_node *root,
+                     struct interval_node_extent *ext,
+                     struct interval_node_extent *limiter)
+{
+        /* The interval_is_overlapped() assertions are expensive because they
+         * may traverse many nodes to find an overlapping one. */
+        LASSERT(interval_is_overlapped(root, ext) == 0);
+        if (!limiter || limiter->start < ext->start)
+                ext->start = interval_expand_low(root, ext->start);
+        if (!limiter || limiter->end > ext->end)
+                ext->end = interval_expand_high(root, ext->end);
+        LASSERT(interval_is_overlapped(root, ext) == 0);
+}
+EXPORT_SYMBOL(interval_expand);
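
A hedged illustration of interval_expand() above (the values are worked by
hand, not part of the patch): with granted extents [0, 10] and [20, 30] in the
tree, a non-overlapping request [12, 15] can only grow upward to 19, one less
than the low end of the next granted extent, while its low end is left alone
because interval_expand_low() currently only handles the empty tree:

    static void expand_example(struct interval_node *root)
    {
            /* assume root holds granted extents [0, 10] and [20, 30] */
            struct interval_node_extent ext = { .start = 12, .end = 15 };

            interval_expand(root, &ext, NULL);
            /* ext is now [12, 19] */
    }
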
index 1880c43..f1f88ce 100644 (file)
 
 #include "ldlm_internal.h"
 
+#define LDLM_MAX_GROWN_EXTENT (32 * 1024 * 1024 - 1)
+
+/* fixup the ldlm_extent after expanding */
+static void ldlm_extent_internal_policy_fixup(struct ldlm_lock *req,
+                                              struct ldlm_extent *new_ex,
+                                              int conflicting)
+{
+        ldlm_mode_t req_mode = req->l_req_mode;
+        __u64 req_start = req->l_req_extent.start;
+        __u64 req_end = req->l_req_extent.end;
+        __u64 req_align, mask;
+        if (conflicting > 32 && (req_mode == LCK_PW || req_mode == LCK_CW)) {
+                if (req_end < req_start + LDLM_MAX_GROWN_EXTENT)
+                        new_ex->end = min(req_start + LDLM_MAX_GROWN_EXTENT,
+                                          new_ex->end);
+        }
+
+        if (new_ex->start == 0 && new_ex->end == OBD_OBJECT_EOF) {
+                EXIT;
+                return;
+        }
+
+        /* we need to ensure that the lock extent is properly aligned to what
+         * the client requested.  We align it to the lowest-common denominator
+         * of the client's requested lock start and end alignment. */
+        mask = 0x1000ULL;
+        req_align = (req_end + 1) | req_start;
+        if (req_align != 0) {
+                while ((req_align & mask) == 0)
+                        mask <<= 1;
+        }
+        mask -= 1;
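+        /* worked example (values are illustrative, not from the patch):
+         * req [0x3000, 0x4fff] gives req_align = 0x5000 | 0x3000 = 0x7000,
+         * whose lowest set bit is 0x1000, so mask = 0xfff and the granted
+         * extent below stays 4kB-aligned */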
+        /* We can only shrink the lock, not grow it.
+         * This should never cause lock to be smaller than requested,
+         * since requested lock was already aligned on these boundaries. */
+        new_ex->start = ((new_ex->start - 1) | mask) + 1;
+        new_ex->end = ((new_ex->end + 1) & ~mask) - 1;
+        LASSERTF(new_ex->start <= req_start,
+                 "mask "LPX64" grant start "LPU64" req start "LPU64"\n",
+                 mask, new_ex->start, req_start);
+        LASSERTF(new_ex->end >= req_end,
+                 "mask "LPX64" grant end "LPU64" req end "LPU64"\n",
+                 mask, new_ex->end, req_end);
+}
+
+/* The purpose of this function is to return:
+ * - the maximum extent
+ * - containing the requested extent
+ * - and not overlapping existing conflicting extents outside the requested one
+ *
+ * Use the interval tree to expand the lock extent for granted locks.
+ */
+static void ldlm_extent_internal_policy_granted(struct ldlm_lock *req,
+                                                struct ldlm_extent *new_ex)
+{
+        struct ldlm_resource *res = req->l_resource;
+        ldlm_mode_t req_mode = req->l_req_mode;
+        __u64 req_start = req->l_req_extent.start;
+        __u64 req_end = req->l_req_extent.end;
+        struct ldlm_interval_tree *tree;
+        struct interval_node_extent limiter = { new_ex->start, new_ex->end };
+        int conflicting = 0;
+        int idx;
+        ENTRY;
+
+        lockmode_verify(req_mode);
+
+        /* use the interval tree to handle the granted ldlm extent locks */
+        for (idx = 0; idx < LCK_MODE_NUM; idx++) {
+                struct interval_node_extent ext = { req_start, req_end };
+
+                tree = &res->lr_itree[idx];
+                if (lockmode_compat(tree->lit_mode, req_mode))
+                        continue;
+
+                conflicting += tree->lit_size;
+                if (conflicting > 4)
+                        limiter.start = req_start;
+
+                if (interval_is_overlapped(tree->lit_root, &ext))
+                        printk("req_mode = %d, tree->lit_mode = %d, tree->lit_size = %d\n",
+                               req_mode, tree->lit_mode, tree->lit_size);
+                interval_expand(tree->lit_root, &ext, &limiter);
+                limiter.start = max(limiter.start, ext.start);
+                limiter.end = min(limiter.end, ext.end);
+                if (limiter.start == req_start && limiter.end == req_end)
+                        break;
+        }
+
+        new_ex->start = limiter.start;
+        new_ex->end = limiter.end;
+        LASSERT(new_ex->start <= req_start);
+        LASSERT(new_ex->end >= req_end);
+
+        ldlm_extent_internal_policy_fixup(req, new_ex, conflicting);
+        EXIT;
+}
+
 /* The purpose of this function is to return:
  * - the maximum extent
  * - containing the requested extent
  * - and not overlapping existing conflicting extents outside the requested one
  */
 static void
-ldlm_extent_internal_policy(struct list_head *queue, struct ldlm_lock *req,
-                            struct ldlm_extent *new_ex)
+ldlm_extent_internal_policy_waiting(struct ldlm_lock *req,
+                                    struct ldlm_extent *new_ex)
 {
         struct list_head *tmp;
+        struct ldlm_resource *res = req->l_resource;
         ldlm_mode_t req_mode = req->l_req_mode;
         __u64 req_start = req->l_req_extent.start;
         __u64 req_end = req->l_req_extent.end;
-        __u64 req_align, mask;
         int conflicting = 0;
         ENTRY;
 
         lockmode_verify(req_mode);
 
-        list_for_each(tmp, queue) {
+        /* for waiting locks */
+        list_for_each(tmp, &res->lr_waiting) {
                 struct ldlm_lock *lock;
                 struct ldlm_extent *l_extent;
 
@@ -85,16 +185,14 @@ ldlm_extent_internal_policy(struct list_head *queue, struct ldlm_lock *req,
                         new_ex->start = req_start;
 
                 /* If lock doesn't overlap new_ex, skip it. */
-                if (l_extent->end < new_ex->start ||
-                    l_extent->start > new_ex->end)
+                if (!ldlm_extent_overlap(l_extent, new_ex))
                         continue;
 
                 /* Locks conflicting in requested extents and we can't satisfy
                  * both locks, so ignore it.  Either we will ping-pong this
                  * extent (we would regardless of what extent we granted) or
                  * lock is unused and it shouldn't limit our extent growth. */
-                if (lock->l_req_extent.end >= req_start &&
-                    lock->l_req_extent.start <= req_end)
+                if (ldlm_extent_overlap(&lock->l_req_extent,&req->l_req_extent))
                         continue;
 
                 /* We grow extents downwards only as far as they don't overlap
@@ -123,43 +221,11 @@ ldlm_extent_internal_policy(struct list_head *queue, struct ldlm_lock *req,
                 }
         }
 
-#define LDLM_MAX_GROWN_EXTENT (32 * 1024 * 1024 - 1)
-        if (conflicting > 32 && (req_mode == LCK_PW || req_mode == LCK_CW)) {
-                if (req_end < req_start + LDLM_MAX_GROWN_EXTENT)
-                        new_ex->end = min(req_start + LDLM_MAX_GROWN_EXTENT,
-                                          new_ex->end);
-        }
-
-        if (new_ex->start == 0 && new_ex->end == OBD_OBJECT_EOF) {
-                EXIT;
-                return;
-        }
-
-        /* we need to ensure that the lock extent is properly aligned to what
-         * the client requested.  We align it to the lowest-common denominator
-         * of the clients requested lock start and end alignment. */
-        mask = 0x1000ULL;
-        req_align = (req_end + 1) | req_start;
-        if (req_align != 0) {
-                while ((req_align & mask) == 0)
-                        mask <<= 1;
-        }
-        mask -= 1;
-        /* We can only shrink the lock, not grow it.
-         * This should never cause lock to be smaller than requested,
-         * since requested lock was already aligned on these boundaries. */
-        new_ex->start = ((new_ex->start - 1) | mask) + 1;
-        new_ex->end = ((new_ex->end + 1) & ~mask) - 1;
-        LASSERTF(new_ex->start <= req_start,
-                 "mask "LPX64" grant start "LPU64" req start "LPU64"\n",
-                 mask, new_ex->start, req_start);
-        LASSERTF(new_ex->end >= req_end,
-                 "mask "LPX64" grant end "LPU64" req end "LPU64"\n",
-                 mask, new_ex->end, req_end);
-
+        ldlm_extent_internal_policy_fixup(req, new_ex, conflicting);
         EXIT;
 }
 
+
 /* In order to determine the largest possible extent we can grant, we need
  * to scan all of the queues. */
 static void ldlm_extent_policy(struct ldlm_resource *res,
@@ -182,8 +248,8 @@ static void ldlm_extent_policy(struct ldlm_resource *res,
                 /* fast-path whole file locks */
                 return;
 
-        ldlm_extent_internal_policy(&res->lr_granted, lock, &new_ex);
-        ldlm_extent_internal_policy(&res->lr_waiting, lock, &new_ex);
+        ldlm_extent_internal_policy_granted(lock, &new_ex);
+        ldlm_extent_internal_policy_waiting(lock, &new_ex);
 
         if (new_ex.start != lock->l_policy_data.l_extent.start ||
             new_ex.end != lock->l_policy_data.l_extent.end) {
@@ -193,6 +259,42 @@ static void ldlm_extent_policy(struct ldlm_resource *res,
         }
 }
 
+struct ldlm_extent_compat_args {
+        struct list_head *work_list;
+        struct ldlm_lock *lock;
+        ldlm_mode_t mode;
+        int *compat;
+};
+
+static enum interval_iter ldlm_extent_compat_cb(struct interval_node *n,
+                                                void *data)
+{
+        struct ldlm_extent_compat_args *priv = data;
+        struct ldlm_interval *node = to_ldlm_interval(n);
+        struct list_head *work_list = priv->work_list;
+        struct ldlm_lock *lock, *enq = priv->lock;
+        ldlm_mode_t mode = priv->mode;
+        ENTRY;
+
+        LASSERT(!list_empty(&node->li_group));
+
+        list_for_each_entry(lock, &node->li_group, l_sl_policy) {
+                /* interval tree is for granted lock */
+                LASSERTF(mode == lock->l_granted_mode,
+                         "mode = %s, lock->l_granted_mode = %s\n",
+                         ldlm_lockname[mode],
+                         ldlm_lockname[lock->l_granted_mode]);
+
+                if (lock->l_blocking_ast)
+                        ldlm_add_ast_work_item(lock, enq, work_list);
+        }
+
+        if (priv->compat)
+                *priv->compat = 0;
+
+        RETURN(INTERVAL_ITER_CONT);
+}
+
 /* Determine if the lock is compatible with all locks on the queue.
  * We stop walking the queue if we hit ourselves so we don't take
 * conflicting locks enqueued after us into account, or we'd wait forever.
@@ -209,6 +311,7 @@ ldlm_extent_compat_queue(struct list_head *queue, struct ldlm_lock *req,
 {
         struct list_head *tmp;
         struct ldlm_lock *lock;
+        struct ldlm_resource *res = req->l_resource;
         ldlm_mode_t req_mode = req->l_req_mode;
         __u64 req_start = req->l_req_extent.start;
         __u64 req_end = req->l_req_extent.end;
@@ -218,6 +321,71 @@ ldlm_extent_compat_queue(struct list_head *queue, struct ldlm_lock *req,
 
         lockmode_verify(req_mode);
 
+        /* Using interval tree for granted lock */
+        if (queue == &res->lr_granted) {
+                struct ldlm_interval_tree *tree;
+                struct ldlm_extent_compat_args data = {.work_list = work_list,
+                                               .lock = req,
+                                               .compat = &compat };
+                struct interval_node_extent ex = { .start = req_start,
+                                                   .end = req_end };
+                int idx, rc;
+
+                for (idx = 0; idx < LCK_MODE_NUM; idx++) {
+                        tree = &res->lr_itree[idx];
+                        if (tree->lit_root == NULL) /* empty tree, skipped */
+                                continue;
+
+                        data.mode = tree->lit_mode;
+                        if (lockmode_compat(req_mode, tree->lit_mode)) {
+                                struct ldlm_interval *node;
+                                struct ldlm_extent *extent;
+
+                                if (req_mode != LCK_GROUP)
+                                        continue;
+
+                                /* group lock, grant it immediately if
+                                 * compatible */
+                                node = to_ldlm_interval(tree->lit_root);
+                                extent = ldlm_interval_extent(node);
+                                if (req->l_policy_data.l_extent.gid ==
+                                    extent->gid)
+                                        RETURN(2);
+                        }
+
+                        if (tree->lit_mode == LCK_GROUP) {
+                                if (*flags & LDLM_FL_BLOCK_NOWAIT) {
+                                        compat = -EWOULDBLOCK;
+                                        goto destroylock;
+                                }
+
+                                *flags |= LDLM_FL_NO_TIMEOUT;
+                                if (!work_list)
+                                        RETURN(0);
+
+                                /* if work_list is not NULL, add all
+                                 * locks in the tree to work_list */
+                                compat = 0;
+                                interval_iterate(tree->lit_root,
+                                                 ldlm_extent_compat_cb, &data);
+                                continue;
+                        }
+
+                        if (!work_list) {
+                                rc = interval_is_overlapped(tree->lit_root,&ex);
+                                if (rc)
+                                        RETURN(0);
+                        } else {
+                                interval_search(tree->lit_root, &ex,
+                                                ldlm_extent_compat_cb, &data);
+                                if (!list_empty(work_list) && compat)
+                                        compat = 0;
+                        }
+                }
+                RETURN(compat);
+        }
+
+        /* for waiting queue */
         list_for_each(tmp, queue) {
                 lock = list_entry(tmp, struct ldlm_lock, l_res_link);
 
@@ -444,8 +612,24 @@ int ldlm_process_extent_lock(struct ldlm_lock *lock, int *flags, int first_enq,
                 unlock_res(res);
                 rc = ldlm_run_ast_work(&rpc_list, LDLM_WORK_BL_AST);
                 lock_res(res);
-                if (rc == -ERESTART)
+
+                if (rc == -ERESTART) {
+                        /* lock was granted while resource was unlocked. */
+                        if (lock->l_granted_mode == lock->l_req_mode) {
+                                /* bug 11300: if the lock has been granted,
+                                 * break early; otherwise we would restart,
+                                 * ldlm_resource_unlink would be called and
+                                 * would free the interval node, and we would
+                                 * then fail in ldlm_extent_add_lock(). */
+                                *flags &= ~(LDLM_FL_BLOCK_GRANTED | LDLM_FL_BLOCK_CONV |
+                                            LDLM_FL_BLOCK_WAIT);
+                                GOTO(out, rc = 0);
+                        }
+
                         GOTO(restart, -ERESTART);
+                }
+
                 *flags |= LDLM_FL_BLOCK_GRANTED;
                 /* this way we force client to wait for the lock
                  * endlessly once the lock is enqueued -bzzz */
@@ -493,3 +677,124 @@ __u64 ldlm_extent_shift_kms(struct ldlm_lock *lock, __u64 old_kms)
 
         RETURN(kms);
 }
+
+cfs_mem_cache_t *ldlm_interval_slab;
+struct ldlm_interval *ldlm_interval_alloc(struct ldlm_lock *lock)
+{
+        struct ldlm_interval *node;
+        ENTRY;
+
+        LASSERT(lock->l_resource->lr_type == LDLM_EXTENT);
+        OBD_SLAB_ALLOC(node, ldlm_interval_slab, CFS_ALLOC_IO, sizeof(*node));
+        if (node == NULL)
+                RETURN(NULL);
+
+        CFS_INIT_LIST_HEAD(&node->li_group);
+        ldlm_interval_attach(node, lock);
+        RETURN(node);
+}
+
+void ldlm_interval_free(struct ldlm_interval *node)
+{
+        if (node) {
+                LASSERT(list_empty(&node->li_group));
+                OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node));
+        }
+}
+
+/* interval tree, for LDLM_EXTENT. */
+void ldlm_interval_attach(struct ldlm_interval *n,
+                          struct ldlm_lock *l)
+{
+        LASSERT(l->l_tree_node == NULL);
+        LASSERT(l->l_resource->lr_type == LDLM_EXTENT);
+
+        list_add_tail(&l->l_sl_policy, &n->li_group);
+        l->l_tree_node = n;
+}
+
+struct ldlm_interval *ldlm_interval_detach(struct ldlm_lock *l)
+{
+        struct ldlm_interval *n = l->l_tree_node;
+
+        if (n == NULL)
+                return NULL;
+
+        LASSERT(!list_empty(&n->li_group));
+        l->l_tree_node = NULL;
+        list_del_init(&l->l_sl_policy);
+
+        return (list_empty(&n->li_group) ? n : NULL);
+}
+
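The attach/detach pair above gives the interval node shared-ownership semantics: every lock in a policy group points at the same node, and detach hands the node back for freeing only to the last lock that leaves the group. A minimal stand-alone model of that rule (a plain counter stands in for the li_group list; not Lustre code):

#include <assert.h>
#include <stddef.h>

struct node { int members; };               /* stand-in for ldlm_interval */

static struct node *detach(struct node *n)
{
        assert(n->members > 0);
        return (--n->members == 0) ? n : NULL;  /* last one out owns it */
}

int main(void)
{
        struct node n = { 2 };              /* two locks share one node */
        assert(detach(&n) == NULL);         /* first detach: node kept */
        assert(detach(&n) == &n);           /* last detach: caller frees */
        return 0;
}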
+static inline int lock_mode_to_index(ldlm_mode_t mode)
+{
+        int index;
+
+        LASSERT(mode != 0);
+        LASSERT(IS_PO2(mode));
+        for (index = -1; mode; index++, mode >>= 1) ;
+        LASSERT(index < LCK_MODE_NUM);
+        return index;
+}
+
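lock_mode_to_index() above maps a power-of-two lock mode to the index of its per-mode tree, i.e. log2(mode). A stand-alone check of the shift loop, assuming the seven LDLM modes are the powers of two from 1 to 64 (consistent with the LCK_MODE_NUM == 7 assertion added to wiretest.c later in this change):

#include <assert.h>

static int mode_to_index(unsigned int mode)
{
        int index;

        for (index = -1; mode; index++, mode >>= 1) ;
        return index;
}

int main(void)
{
        assert(mode_to_index(1) == 0);      /* lowest mode -> tree 0 */
        assert(mode_to_index(4) == 2);
        assert(mode_to_index(64) == 6);     /* highest mode -> tree 6 */
        return 0;
}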
+void ldlm_extent_add_lock(struct ldlm_resource *res,
+                          struct ldlm_lock *lock)
+{
+        struct interval_node *found, **root;
+        struct ldlm_interval *node;
+        struct ldlm_extent *extent;
+        int idx;
+
+        LASSERT(lock->l_granted_mode == lock->l_req_mode);
+
+        node = lock->l_tree_node;
+        LASSERT(node != NULL);
+
+        idx = lock_mode_to_index(lock->l_granted_mode);
+        LASSERT(lock->l_granted_mode == 1 << idx);
+        LASSERT(lock->l_granted_mode == res->lr_itree[idx].lit_mode);
+
+        /* node extent initialize */
+        extent = &lock->l_policy_data.l_extent;
+        interval_set(&node->li_node, extent->start, extent->end);
+
+        root = &res->lr_itree[idx].lit_root;
+        found = interval_insert(&node->li_node, root);
+        if (found) { /* an existing policy group was found */
+                struct ldlm_interval *tmp = ldlm_interval_detach(lock);
+                LASSERT(tmp != NULL);
+                ldlm_interval_free(tmp);
+                ldlm_interval_attach(to_ldlm_interval(found), lock);
+        }
+        res->lr_itree[idx].lit_size++;
+
+        /* even though we use the interval tree to manage extent locks, we
+         * also add them to the granted list, for debugging purposes */
+        ldlm_resource_add_lock(res, &res->lr_granted, lock);
+}
+
+void ldlm_extent_unlink_lock(struct ldlm_lock *lock)
+{
+        struct ldlm_resource *res = lock->l_resource;
+        struct ldlm_interval *node;
+        struct ldlm_interval_tree *tree;
+        int idx;
+
+        if (lock->l_granted_mode != lock->l_req_mode)
+                return;
+
+        LASSERT(lock->l_tree_node != NULL);
+        idx = lock_mode_to_index(lock->l_granted_mode);
+        LASSERT(lock->l_granted_mode == 1 << idx);
+        tree = &res->lr_itree[idx];
+
+        LASSERT(tree->lit_root != NULL); /* the tree must not be empty */
+
+        tree->lit_size--;
+        node = ldlm_interval_detach(lock);
+        if (node) {
+                interval_erase(&node->li_node, &tree->lit_root);
+                ldlm_interval_free(node);
+        }
+}
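With one interval tree per lock mode on the resource, a conflict scan only has to probe the trees whose mode is incompatible with the request, and each probe is an O(log n) overlap search instead of a walk of the whole granted list. A self-contained model of such a probe (a plain array stands in for one per-mode tree; the PR/PW conflict is the usual LDLM compatibility rule, the extents are made up):

#include <stdio.h>

struct ext { unsigned long long start, end; };

static int overlap(const struct ext *a, const struct ext *b)
{
        return a->start <= b->end && b->start <= a->end;
}

int main(void)
{
        struct ext pw[] = { { 0, 4095 }, { 8192, 12287 } }; /* granted PW */
        struct ext req = { 4096, 8191 };                    /* PR request */
        int i, conflict = 0;

        for (i = 0; i < 2; i++)      /* probe only the incompatible mode */
                conflict |= overlap(&req, &pw[i]);

        printf("conflict=%d\n", conflict);                  /* conflict=0 */
        return 0;
}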
index 294ba72..3127e8c 100644 (file)
@@ -112,6 +112,8 @@ int ldlm_process_plain_lock(struct ldlm_lock *lock, int *flags, int first_enq,
 /* ldlm_extent.c */
 int ldlm_process_extent_lock(struct ldlm_lock *lock, int *flags, int first_enq,
                              ldlm_error_t *err, struct list_head *work_list);
+void ldlm_extent_add_lock(struct ldlm_resource *res, struct ldlm_lock *lock);
+void ldlm_extent_unlink_lock(struct ldlm_lock *lock);
 
 /* ldlm_flock.c */
 int ldlm_process_flock_lock(struct ldlm_lock *req, int *flags, int first_enq,
@@ -137,6 +139,23 @@ struct ldlm_state {
         struct ldlm_bl_pool *ldlm_bl_pool;
 };
 
+/* interval tree, for LDLM_EXTENT. */
+extern cfs_mem_cache_t *ldlm_interval_slab; /* slab cache for ldlm_interval */
+extern void ldlm_interval_attach(struct ldlm_interval *n, struct ldlm_lock *l);
+extern struct ldlm_interval *ldlm_interval_detach(struct ldlm_lock *l);
+extern struct ldlm_interval *ldlm_interval_alloc(struct ldlm_lock *lock);
+extern void ldlm_interval_free(struct ldlm_interval *node);
+/* this function must be called with res lock held */
+static inline struct ldlm_extent *
+ldlm_interval_extent(struct ldlm_interval *node)
+{
+        struct ldlm_lock *lock;
+        LASSERT(!list_empty(&node->li_group));
+
+        lock = list_entry(node->li_group.next, struct ldlm_lock, l_sl_policy);
+        return &lock->l_policy_data.l_extent;
+}
+
 int ldlm_init(void);
 void ldlm_exit(void);
 
index abc77f6..d541298 100644 (file)
@@ -162,6 +162,7 @@ void ldlm_lock_put(struct ldlm_lock *lock)
                 if (lock->l_lvb_data != NULL)
                         OBD_FREE(lock->l_lvb_data, lock->l_lvb_len);
 
+                ldlm_interval_free(ldlm_interval_detach(lock));
                 OBD_FREE_RCU_CB(lock, sizeof(*lock), &lock->l_handle, 
                                ldlm_lock_free);
         }
@@ -879,6 +880,8 @@ void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list)
         lock->l_granted_mode = lock->l_req_mode;
         if (res->lr_type == LDLM_PLAIN || res->lr_type == LDLM_IBITS)
                 ldlm_grant_lock_with_skiplist(lock);
+        else if (res->lr_type == LDLM_EXTENT)
+                ldlm_extent_add_lock(res, lock);
         else
                 ldlm_resource_add_lock(res, &res->lr_granted, lock);
 
@@ -1136,16 +1139,28 @@ struct ldlm_lock *ldlm_lock_create(struct ldlm_namespace *ns,
         lock->l_glimpse_ast = glimpse;
         lock->l_pid = cfs_curproc_pid();
 
+        lock->l_tree_node = NULL;
+        /* if this is an extent lock, allocate the interval tree node */
+        if (type == LDLM_EXTENT) {
+                if (ldlm_interval_alloc(lock) == NULL)
+                        GOTO(out, 0);
+        }
+
         if (lvb_len) {
                 lock->l_lvb_len = lvb_len;
                 OBD_ALLOC(lock->l_lvb_data, lvb_len);
-                if (lock->l_lvb_data == NULL) {
-                        OBD_SLAB_FREE(lock, ldlm_lock_slab, sizeof(*lock));
-                        RETURN(NULL);
-                }
+                if (lock->l_lvb_data == NULL)
+                        GOTO(out, 0);
         }
 
         RETURN(lock);
+
+out:
+        if (lock->l_lvb_data)
+                OBD_FREE(lock->l_lvb_data, lvb_len);
+        ldlm_interval_free(ldlm_interval_detach(lock));
+        OBD_SLAB_FREE(lock, ldlm_lock_slab, sizeof(*lock));
+        return NULL;
 }
 
 ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns,
@@ -1157,6 +1172,7 @@ ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns,
         int local = ns_is_client(res->lr_namespace);
         ldlm_processing_policy policy;
         ldlm_error_t rc = ELDLM_OK;
+        struct ldlm_interval *node = NULL;
         ENTRY;
 
         do_gettimeofday(&lock->l_enqueued_time);
@@ -1183,16 +1199,35 @@ ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns,
                 }
         }
 
+        /* A replayed lock may already be in the granted list, in which case
+         * unlinking it would free its interval node. Allocate a spare node
+         * early; otherwise we could not regrant the lock later. - jay */
+        if (!local && (*flags & LDLM_FL_REPLAY) && res->lr_type == LDLM_EXTENT)
+                OBD_SLAB_ALLOC(node, ldlm_interval_slab, CFS_ALLOC_IO,
+                               sizeof(*node));
+
         lock_res_and_lock(lock);
         if (local && lock->l_req_mode == lock->l_granted_mode) {
-                /* The server returned a blocked lock, but it was granted before
-                 * we got a chance to actually enqueue it.  We don't need to do
-                 * anything else. */
+                /* The server returned a blocked lock, but it was granted
+                 * before we got a chance to actually enqueue it.  We don't
+                 * need to do anything else. */
                 *flags &= ~(LDLM_FL_BLOCK_GRANTED |
                             LDLM_FL_BLOCK_CONV | LDLM_FL_BLOCK_WAIT);
                 GOTO(out, ELDLM_OK);
         }
 
+        ldlm_resource_unlink_lock(lock);
+        if (res->lr_type == LDLM_EXTENT && lock->l_tree_node == NULL) {
+                if (node == NULL) {
+                        ldlm_lock_destroy_nolock(lock);
+                        GOTO(out, rc = -ENOMEM);
+                }
+
+                ldlm_interval_attach(node, lock);
+                node = NULL;
+        }
+
         /* Some flags from the enqueue want to make it into the AST, via the
          * lock's l_flags. */
         lock->l_flags |= *flags & LDLM_AST_DISCARD_DATA;
@@ -1208,7 +1243,6 @@ ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns,
          *
          * FIXME (bug 268): Detect obvious lies by checking compatibility in
          * granted/converting queues. */
-        ldlm_resource_unlink_lock(lock);
         if (local) {
                 if (*flags & LDLM_FL_BLOCK_CONV)
                         ldlm_resource_add_lock(res, &res->lr_converting, lock);
@@ -1236,6 +1270,8 @@ ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns,
         GOTO(out, rc);
 out:
         unlock_res_and_lock(lock);
+        if (node)
+                OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node));
         return rc;
 }
 
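The replay path above pre-allocates a spare interval node before taking the resource lock, because the allocation may block while under the res lock the code may only attach the node or free it on the way out. A generic sketch of that allocate-early, consume-under-lock, free-if-unused pattern (not the Lustre API):

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t res_lock = PTHREAD_MUTEX_INITIALIZER;
static void *attached;                  /* consumer of the spare node */

static int enqueue(int need_node)
{
        void *node = malloc(64);        /* may block: do it before locking */

        if (node == NULL)
                return -1;

        pthread_mutex_lock(&res_lock);
        if (need_node && attached == NULL) {
                attached = node;        /* ownership handed over */
                node = NULL;
        }
        pthread_mutex_unlock(&res_lock);

        free(node);                     /* no-op if the node was consumed */
        return 0;
}

int main(void)
{
        int rc = enqueue(1);

        free(attached);
        return rc;
}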
@@ -1656,6 +1692,7 @@ struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode,
         struct ldlm_lock *mark_lock = NULL;
         int join = LDLM_JOIN_NONE;
         ldlm_error_t err;
+        struct ldlm_interval *node;
         ENTRY;
 
         if (new_mode == lock->l_granted_mode) { // No changes? Just return.
@@ -1663,6 +1700,12 @@ struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode,
                 RETURN(lock->l_resource);
         }
 
+        /* We can't check the lock type here because the lock's bitlock is
+         * not held, so do the allocation blindly. -jay */
+        OBD_SLAB_ALLOC(node, ldlm_interval_slab, CFS_ALLOC_IO, sizeof(*node));
+        if (node == NULL)  /* Actually, this causes EDEADLOCK to be returned */
+                RETURN(NULL);
+
         LASSERTF(new_mode == LCK_PW && lock->l_granted_mode == LCK_PR,
                  "new_mode %u, granted %u\n", new_mode, lock->l_granted_mode);
 
@@ -1698,8 +1741,17 @@ struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode,
                 else if (lock->l_res_link.next != &res->lr_granted)
                         mark_lock = list_entry(lock->l_res_link.next,
                                                struct ldlm_lock, l_res_link);
+        } else {
+                ldlm_resource_unlink_lock(lock);
+                if (res->lr_type == LDLM_EXTENT) {
+                        /* FIXME: ugly code, we have to attach the lock to an
+                         * interval node again since it may be granted soon */
+                        CFS_INIT_LIST_HEAD(&node->li_group);
+                        ldlm_interval_attach(node, lock);
+                        node = NULL;
+                }
         }
-        ldlm_resource_unlink_lock(lock);
 
         /* If this is a local resource, put it on the appropriate list. */
         if (ns_is_client(res->lr_namespace)) {
@@ -1725,7 +1777,11 @@ struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode,
                 rc = policy(lock, &pflags, 0, &err, &rpc_list);
                 if (rc == LDLM_ITER_STOP) {
                         lock->l_req_mode = old_mode;
-                        ldlm_granted_list_add_lock(lock, mark_lock, join);
+                        if (res->lr_type == LDLM_EXTENT)
+                                ldlm_extent_add_lock(res, lock);
+                        else
+                                ldlm_granted_list_add_lock(lock, mark_lock,
+                                                           join);
                         res = NULL;
                 } else {
                         *flags |= LDLM_FL_BLOCK_GRANTED;
@@ -1736,6 +1792,8 @@ struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode,
 
         if (granted)
                 ldlm_run_ast_work(&rpc_list, LDLM_WORK_CP_AST);
+        if (node)
+                OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node));
         RETURN(res);
 }
 
index cc802d1..1b0437a 100644 (file)
@@ -1340,7 +1340,26 @@ static void ldlm_handle_cp_callback(struct ptlrpc_request *req,
 
         LDLM_DEBUG(lock, "client completion callback handler START");
 
+        if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE)) {
+                int to = cfs_time_seconds(1);
+                while (to > 0) {
+                        to = schedule_timeout(to);
+                        if (lock->l_granted_mode == lock->l_req_mode ||
+                            lock->l_destroyed)
+                                break;
+                }
+        }
+
         lock_res_and_lock(lock);
+        if (lock->l_destroyed ||
+            lock->l_granted_mode == lock->l_req_mode) {
+                /* bug 11300: the lock has already been granted */
+                unlock_res_and_lock(lock);
+                LDLM_DEBUG(lock, "Double grant race happened");
+                LDLM_LOCK_PUT(lock);
+                EXIT;
+                return;
+        }
 
         /* If we receive the completion AST before the actual enqueue returned,
          * then we might need to switch lock modes, resources, or extents. */
@@ -1633,6 +1652,15 @@ static int ldlm_callback_handler(struct ptlrpc_request *req)
                 RETURN(0);
         }
 
+        /* Force a known safe race, send a cancel to the server for a lock
+         * which the server has already started a blocking callback on. */
+        if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE) &&
+            lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK) {
+                rc = ldlm_cli_cancel(&dlm_req->lock_handle[0]);
+                if (rc < 0)
+                        CERROR("ldlm_cli_cancel: %d\n", rc);
+        }
+
         lock = ldlm_handle2lock_ns(ns, &dlm_req->lock_handle[0]);
         if (!lock) {
                 CDEBUG(D_DLMTRACE, "callback on lock "LPX64" - lock "
@@ -2172,6 +2200,15 @@ int __init ldlm_init(void)
                 return -ENOMEM;
         }
 
+        ldlm_interval_slab = cfs_mem_cache_create("interval_node",
+                                        sizeof(struct ldlm_interval),
+                                        0, SLAB_HWCACHE_ALIGN);
+        if (ldlm_interval_slab == NULL) {
+                cfs_mem_cache_destroy(ldlm_resource_slab);
+                cfs_mem_cache_destroy(ldlm_lock_slab);
+                return -ENOMEM;
+        }
+
         return 0;
 }
 
@@ -2184,6 +2221,8 @@ void __exit ldlm_exit(void)
         LASSERTF(rc == 0, "couldn't free ldlm resource slab\n");
         rc = cfs_mem_cache_destroy(ldlm_lock_slab);
         LASSERTF(rc == 0, "couldn't free ldlm lock slab\n");
+        rc = cfs_mem_cache_destroy(ldlm_interval_slab);
+        LASSERTF(rc == 0, "couldn't free interval node slab\n");
 }
 
 /* ldlm_extent.c */
index 2f74265..b87ab5f 100644 (file)
@@ -622,6 +622,7 @@ static __u32 ldlm_hash_fn(struct ldlm_resource *parent,
 static struct ldlm_resource *ldlm_resource_new(void)
 {
         struct ldlm_resource *res;
+        int idx;
 
         OBD_SLAB_ALLOC(res, ldlm_resource_slab, CFS_ALLOC_IO, sizeof *res);
         if (res == NULL)
@@ -634,6 +635,14 @@ static struct ldlm_resource *ldlm_resource_new(void)
         CFS_INIT_LIST_HEAD(&res->lr_granted);
         CFS_INIT_LIST_HEAD(&res->lr_converting);
         CFS_INIT_LIST_HEAD(&res->lr_waiting);
+
+        /* initialize interval trees for each lock mode*/
+        for (idx = 0; idx < LCK_MODE_NUM; idx++) {
+                res->lr_itree[idx].lit_size = 0;
+                res->lr_itree[idx].lit_mode = 1 << idx;
+                res->lr_itree[idx].lit_root = NULL;
+        }
+
         atomic_set(&res->lr_refcount, 1);
         spin_lock_init(&res->lr_lock);
 
@@ -905,8 +914,13 @@ void ldlm_resource_insert_lock_after(struct ldlm_lock *original,
 
 void ldlm_resource_unlink_lock(struct ldlm_lock *lock)
 {
+        int type = lock->l_resource->lr_type;
+
         check_res_locked(lock->l_resource);
-        ldlm_unlink_lock_skiplist(lock);
+        if (type == LDLM_IBITS || type == LDLM_PLAIN)
+                ldlm_unlink_lock_skiplist(lock);
+        else if (type == LDLM_EXTENT)
+                ldlm_extent_unlink_lock(lock);
         list_del_init(&lock->l_res_link);
 }
 
index 399698f..275d523 100644 (file)
@@ -1566,6 +1566,53 @@ static int filter_destroy_internal(struct obd_device *obd, obd_id objid,
         return(rc);
 }
 
+struct filter_intent_args {
+        struct ldlm_lock **victim;
+        __u64 size;
+        int *liblustre;
+};
+
+static enum interval_iter filter_intent_cb(struct interval_node *n,
+                                           void *args)
+{
+        struct ldlm_interval *node = (struct ldlm_interval *)n;
+        struct filter_intent_args *arg = (struct filter_intent_args*)args;
+        __u64 size = arg->size;
+        struct ldlm_lock **v = arg->victim;
+        struct ldlm_lock *lck;
+
+        /* If the interval does not reach past the current file size,
+         * stop the iteration. */
+        if (interval_high(n) <= size)
+                return INTERVAL_ITER_STOP;
+
+        list_for_each_entry(lck, &node->li_group, l_sl_policy) {
+                /* Don't send glimpse ASTs to liblustre clients.
+                 * They aren't listening for them, and they do
+                 * entirely synchronous I/O anyways. */
+                if (lck->l_export == NULL ||
+                    lck->l_export->exp_libclient == 1)
+                        continue;
+
+                if (*arg->liblustre)
+                        *arg->liblustre = 0;
+
+                if (*v == NULL) {
+                        *v = LDLM_LOCK_GET(lck);
+                } else if ((*v)->l_policy_data.l_extent.start <
+                           lck->l_policy_data.l_extent.start) {
+                        LDLM_LOCK_PUT(*v);
+                        *v = LDLM_LOCK_GET(lck);
+                }
+
+                /* all locks in a policy group share the same extent, so
+                 * checking one of them is enough */
+                break;
+        }
+
+        return INTERVAL_ITER_CONT;
+}
+
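filter_intent_cb() above is driven in reverse (via interval_iterate_reverse() in the hunk below), so the scan can stop as soon as an extent no longer reaches past the known file size; among the qualifying locks, the one whose extent starts highest is kept as the glimpse victim. A self-contained model of that selection (a plain array, sorted by start, stands in for the tree; the values are made up):

#include <stdio.h>

struct ext { unsigned long long start, end; };

int main(void)
{
        /* granted write extents, sorted by start; walked in reverse */
        struct ext granted[] = { { 0, 8191 }, { 16384, 65535 } };
        unsigned long long size = 12288;   /* stand-in for lvb_size */
        int i, victim = -1;

        for (i = 1; i >= 0; i--) {
                if (granted[i].end <= size)
                        break;             /* INTERVAL_ITER_STOP */
                victim = i;                /* highest start wins */
        }

        printf("victim=%d\n", victim);     /* prints victim=1 */
        return 0;
}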
 static int filter_intent_policy(struct ldlm_namespace *ns,
                                 struct ldlm_lock **lockp, void *req_cookie,
                                 ldlm_mode_t mode, int flags, void *data)
@@ -1577,9 +1624,10 @@ static int filter_intent_policy(struct ldlm_namespace *ns,
         ldlm_processing_policy policy;
         struct ost_lvb *res_lvb, *reply_lvb;
         struct ldlm_reply *rep;
-        struct list_head *tmp;
         ldlm_error_t err;
-        int rc, tmpflags = 0, only_liblustre = 0;
+        int idx, rc, tmpflags = 0, only_liblustre = 1;
+        struct ldlm_interval_tree *tree;
+        struct filter_intent_args arg;
         int repsize[3] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body),
                            [DLM_LOCKREPLY_OFF]   = sizeof(*rep),
                            [DLM_REPLY_REC_OFF]   = sizeof(*reply_lvb) };
@@ -1604,7 +1652,9 @@ static int filter_intent_policy(struct ldlm_namespace *ns,
 
         /* If we grant any lock at all, it will be a whole-file read lock.
          * Call the extent policy function to see if our request can be
-         * granted, or is blocked. */
+         * granted, or is blocked.
+         * If the OST lock has LDLM_FL_HAS_INTENT set, it is a glimpse lock.
+         */
         lock->l_policy_data.l_extent.start = 0;
         lock->l_policy_data.l_extent.end = OBD_OBJECT_EOF;
         lock->l_req_mode = LCK_PR;
@@ -1652,42 +1702,23 @@ static int filter_intent_policy(struct ldlm_namespace *ns,
         LASSERT(res_lvb != NULL);
         *reply_lvb = *res_lvb;
 
-        list_for_each(tmp, &res->lr_granted) {
-                struct ldlm_lock *tmplock =
-                        list_entry(tmp, struct ldlm_lock, l_res_link);
-
-                if (tmplock->l_granted_mode == LCK_PR)
-                        continue;
-                /*
-                 * ->ns_lock guarantees that no new locks are granted, and,
-                 * therefore, that res->lr_lvb_data cannot increase beyond the
-                 * end of already granted lock. As a result, it is safe to
-                 * check against "stale" reply_lvb->lvb_size value without
-                 * res->lr_lvb_sem.
-                 */
-                if (tmplock->l_policy_data.l_extent.end <= reply_lvb->lvb_size)
-                        continue;
-
-                /* Don't send glimpse ASTs to liblustre clients.  They aren't
-                 * listening for them, and they do entirely synchronous I/O
-                 * anyways. */
-                if (tmplock->l_export == NULL ||
-                    tmplock->l_export->exp_libclient == 1) {
-                        only_liblustre = 1;
-                        continue;
-                }
-
-                if (l == NULL) {
-                        l = LDLM_LOCK_GET(tmplock);
-                        continue;
-                }
-
-                if (l->l_policy_data.l_extent.start >
-                    tmplock->l_policy_data.l_extent.start)
+        /*
+         * ->ns_lock guarantees that no new locks are granted, and,
+         * therefore, that res->lr_lvb_data cannot increase beyond the
+         * end of already granted lock. As a result, it is safe to
+         * check against "stale" reply_lvb->lvb_size value without
+         * res->lr_lvb_sem.
+         */
+        arg.size = reply_lvb->lvb_size;
+        arg.victim = &l;
+        arg.liblustre = &only_liblustre;
+        for (idx = 0; idx < LCK_MODE_NUM; idx++) {
+                tree = &res->lr_itree[idx];
+                if (tree->lit_mode == LCK_PR)
                         continue;
 
-                LDLM_LOCK_PUT(l);
-                l = LDLM_LOCK_GET(tmplock);
+                interval_iterate_reverse(tree->lit_root,
+                                         filter_intent_cb, &arg);
         }
         unlock_res(res);
 
index caab57d..9ade7d5 100644 (file)
@@ -10,6 +10,7 @@ ldlm_objs += $(LDLM)ldlm_plain.o $(LDLM)ldlm_extent.o
 ldlm_objs += $(LDLM)ldlm_request.o $(LDLM)ldlm_lockd.o
 ldlm_objs += $(LDLM)ldlm_flock.o $(LDLM)ldlm_inodebits.o
 ldlm_objs += $(LDLM)ldlm_pool.o
+ldlm_objs += $(LDLM)interval_tree.o
 ptlrpc_objs := client.o recover.o connection.o niobuf.o pack_generic.o
 ptlrpc_objs += events.o ptlrpc_module.o service.o pinger.o recov_thread.o
 ptlrpc_objs += llog_net.o llog_client.o llog_server.o import.o ptlrpcd.o
@@ -29,6 +30,9 @@ ldlm_%.c: @LUSTRE@/ldlm/ldlm_%.c
 l_lock.c: @LUSTRE@/ldlm/l_lock.c
        ln -sf $< $@
 
+interval_tree.c: @LUSTRE@/ldlm/interval_tree.c
+       ln -sf $< $@
+
 EXTRA_PRE_CFLAGS := -I@LUSTRE@/ldlm
 
 @INCLUDE_RULES@
index 7d4754e..d94f0e4 100644 (file)
@@ -5,6 +5,7 @@
 
 LDLM_COMM_SOURCES= $(top_srcdir)/lustre/ldlm/l_lock.c  \
        $(top_srcdir)/lustre/ldlm/ldlm_lock.c           \
+       $(top_srcdir)/lustre/ldlm/interval_tree.c       \
        $(top_srcdir)/lustre/ldlm/ldlm_resource.c       \
        $(top_srcdir)/lustre/ldlm/ldlm_lib.c            \
        $(top_srcdir)/lustre/ldlm/ldlm_plain.c          \
@@ -86,4 +87,4 @@ endif
 
 install-data-hook: $(install_data_hook)
 DIST_SOURCES = $(ptlrpc_objs:.o=.c) ptlrpc_internal.h
-MOSTLYCLEANFILES := @MOSTLYCLEANFILES@  ldlm_*.c l_lock.c
+MOSTLYCLEANFILES := @MOSTLYCLEANFILES@  ldlm_*.c l_lock.c interval_tree.c
index d187035..6fe15c5 100644 (file)
@@ -185,6 +185,8 @@ void lustre_assert_wire_constants(void)
                  (long long)LCK_GROUP);
         LASSERTF(LCK_MAXMODE == 65, " found %lld\n",
                  (long long)LCK_MAXMODE);
+        LASSERTF(LCK_MODE_NUM == 7, " found %lld\n",
+                 (long long)LCK_MODE_NUM);
         CLASSERT(LDLM_PLAIN == 10);
         CLASSERT(LDLM_EXTENT == 11);
         CLASSERT(LDLM_FLOCK == 12);
index 6854a75..7a565f2 100644 (file)
@@ -26,7 +26,7 @@ EXTRA_DIST = $(noinst_SCRIPTS) $(noinst_DATA) \
              $(nobase_noinst_SCRIPTS) $(nobase_noinst_DATA)
 
 if TESTS
-noinst_PROGRAMS = openunlink truncate directio openme writeme mlink utime
+noinst_PROGRAMS = openunlink truncate directio openme writeme mlink utime it_test
 noinst_PROGRAMS += tchmod toexcl fsx test_brw openclose createdestroy
 noinst_PROGRAMS += createmany chownmany statmany multifstat createtest
 noinst_PROGRAMS += opendirunlink opendevunlink unlinkmany fchdir_test checkstat
diff --git a/lustre/tests/it_test.c b/lustre/tests/it_test.c
new file mode 100644 (file)
index 0000000..dae233b
--- /dev/null
@@ -0,0 +1,519 @@
+/* vi:set ts=8 sw=8 expandtab: */
+/* Unit test tool for interval tree.
+ * Written by jay <jxiong@clusterfs.com> 
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include <stdint.h>
+#include <time.h>
+#include <sys/time.h>
+
+#include <../ldlm/interval_tree.c>
+
+#define dprintf(fmt, args...) //printf(fmt, ##args)
+#define error(fmt, args...) do {                        \
+        fflush(stdout), fflush(stderr);                 \
+        fprintf(stderr, "\nError:" fmt, ##args);        \
+        abort();                                        \
+} while(0)
+
+#define __F(ext)         (ext)->start, (ext)->end
+#define __S              "[%llx:%llx]"
+
+#define ALIGN_SIZE       4096
+#define ALIGN_MASK       (~(ALIGN_SIZE - 1))
+
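Offsets in the test are snapped to 4K with the mask above: x & ALIGN_MASK rounds x down to the nearest multiple of ALIGN_SIZE. A quick stand-alone check (with an explicit unsigned type, which the test relies on implicitly):

#include <assert.h>

#define ALIGN_SIZE 4096UL
#define ALIGN_MASK (~(ALIGN_SIZE - 1))

int main(void)
{
        assert((4097UL & ALIGN_MASK) == 4096UL);
        assert((8191UL & ALIGN_MASK) == 4096UL);
        assert((8192UL & ALIGN_MASK) == 8192UL);
        return 0;
}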
+static struct it_node {
+        struct interval_node node;
+        struct list_head list;
+        int hit, valid;
+} *it_array;
+static int it_count;
+static struct list_head header = LIST_HEAD_INIT(header);
+static unsigned long max_count = ULONG_MAX & ALIGN_MASK;
+static int have_wide_lock = 0;
+
+static void it_test_clear(void)
+{
+        int i = 0;
+        for (i = 0; i < it_count; i++)
+                it_array[i].hit = 0;
+}
+
+static enum interval_iter cb(struct interval_node *n, void *args)
+{
+        struct it_node *node = (struct it_node *)n;
+        static int count = 1;
+
+        if (node->hit == 1) {
+                dprintf("NODE "__S" has already been accessed\n",
+                        __F(&n->in_extent));
+                return INTERVAL_ITER_CONT;
+        }
+        
+        if (node->valid == 0) {
+                error("A deleted node "__S" is being accessed\n",
+                       __F(&n->in_extent));
+                return INTERVAL_ITER_STOP;
+        }
+
+        if (count++ == 8) {
+                dprintf("\n");
+                count = 1;
+        }
+        dprintf(""__S" ", __F(&n->in_extent));
+        fflush(stdout);
+
+        node->hit = 1;
+        return INTERVAL_ITER_CONT;
+}
+
+static int it_test_search(struct interval_node *root)
+{
+        struct it_node *n;
+        struct interval_node_extent ext;
+        int times = 10, i, err = 0;
+
+        while (times--) {
+                it_test_clear();
+                ext.start = (random() % max_count) & ALIGN_MASK;
+                ext.end = random() % (max_count - ext.start + 2) + ext.start;
+                ext.end &= ALIGN_MASK;
+                if (ext.end > max_count)
+                        ext.end = max_count;
+
+                dprintf("\n\nSearching the node overlapped "__S" ..\n",
+                        __F(&ext));
+
+                interval_search(root, &ext, cb, NULL);
+
+                dprintf("\nverifying ...");
+        
+                /* verify */
+                for (i = 0; i < it_count; i++) {
+                        n = &it_array[i];
+                        if (n->valid == 0)
+                                continue;
+
+                        if (extent_overlapped(&ext, &n->node.in_extent) &&
+                            n->hit == 0)
+                                error("node "__S" overlaps "__S
+                                      ", but was never hit.\n",
+                                      __F(&n->node.in_extent),
+                                      __F(&ext));
+
+                        if (!extent_overlapped(&ext, &n->node.in_extent) &&
+                            n->hit)
+                                error("node "__S" does not overlap "__S
+                                      ", but was hit.\n",
+                                      __F(&n->node.in_extent),
+                                      __F(&ext));
+                }
+                if (err) error("search error\n");
+                dprintf("ok.\n");
+        }
+
+        return 0;
+}
+
+static int it_test_iterate(struct interval_node *root)
+{
+        int i;
+
+        dprintf("\n\nIterate testing start..\n");
+
+        it_test_clear();
+        interval_iterate(root, cb, NULL);
+
+        /* verify */
+        for (i = 0; i < it_count; i++) {
+                if (it_array[i].valid == 0)
+                        continue;
+                if (it_array[i].hit == 0)
+                        error("Node "__S" is not accessed\n",
+                              __F(&it_array[i].node.in_extent));
+        }
+
+        return 0;
+}
+
+static int it_test_iterate_reverse(struct interval_node *root)
+{
+        int i;
+
+        dprintf("\n\niterate reverse testing start..\n");
+        it_test_clear();
+        interval_iterate_reverse(root, cb, NULL);
+
+        /* verify */
+        for (i = 0; i < it_count; i++) {
+                if (it_array[i].valid == 0)
+                        continue;
+                if (it_array[i].hit == 0)
+                        error("Not every extent is accessed\n");
+        }
+
+        return 0;
+}
+
+static int it_test_find(struct interval_node *root)
+{
+        int idx;
+        struct interval_node_extent *ext;
+
+        dprintf("\ninterval_find testing start ..\n");
+        for (idx = 0; idx < it_count; idx++) {
+                if (it_array[idx].valid == 0)
+                        continue;
+
+                ext = &it_array[idx].node.in_extent;
+                dprintf("Try to find "__S"\n", __F(ext));
+                if (!interval_find(root, ext))
+                        error("interval_find, try to find "__S"\n", __F(ext));
+        }
+        return 0;
+}
+
+/* sanity test is tightly coupled with implementation, so when you changed
+ * the interval tree implementation, change this code also. */
+static enum interval_iter sanity_cb(struct interval_node *node, void *args)
+{
+        __u64 max_high = node->in_max_high;
+        struct interval_node *tmp, *parent;
+        int left = 1, has = 0, nr = 0;
+
+        parent = node->in_parent;
+        node->in_parent = NULL;
+        interval_for_each(tmp, node) {
+                if ((left && node_compare(tmp, node) > 0) ||
+                    (!left && node_compare(tmp, node) < 0))
+                        error("interval tree sanity test\n");
+
+                if (tmp->in_max_high > max_high) {
+                        dprintf("max high sanity check, max_high is %llu,"
+                                "child max_high: %llu"__S"\n",
+                                max_high, tmp->in_max_high,
+                                __F(&tmp->in_extent));
+                        goto err;
+                } else if (tmp->in_max_high == max_high) {
+                        has = 1;
+                }
+
+                if (tmp == node) {
+                        left = 0;
+                        continue;
+                }
+        }
+
+        if (!has) {
+                int count = 1;
+err:
+                dprintf("node"__S":%llu Child list:\n",
+                        node->in_extent.start,
+                        node->in_extent.end,
+                        node->in_max_high);
+
+                interval_for_each(tmp, node) {
+                        dprintf(""__S":%llu ",
+                                __F(&tmp->in_extent),
+                                tmp->in_max_high);
+                        if (count++ == 8) {
+                                dprintf("\n");
+                                count = 1;
+                        }
+                }
+
+                error("max high sanity check, has == %d\n", has);
+        }
+        node->in_parent = parent;
+
+        tmp = node;
+        while (tmp) {
+                if (node_is_black(tmp))
+                        nr++;
+                else if ((tmp->in_left && node_is_red(tmp->in_left)) ||
+                         (tmp->in_right && node_is_red(tmp->in_right)))
+                        error("wrong tree, a red node has red child\n");
+                tmp = tmp->in_left;
+        }
+
+        tmp = node;
+        while (tmp) {
+                if (node_is_black(tmp))
+                        nr--;
+                tmp = tmp->in_right;
+        }
+        if (nr)
+                error("wrong tree, unbalanced!\n");
+
+        return INTERVAL_ITER_CONT;
+}
+
+static int it_test_sanity(struct interval_node *root)
+{
+        it_test_clear();
+        interval_iterate(root, sanity_cb, NULL);
+        return 0;
+}
+
+static int it_test_search_hole(struct interval_node *root)
+{
+        int i, count = 10;
+        struct interval_node_extent ext, ext2;
+        struct it_node *n;
+        __u64 low = 0, high = ~0;
+
+        do {
+                if (--count == 0)
+                        return 0;
+
+                ext.start = random() % max_count;
+                ext.end = ext.start;
+        } while (interval_is_overlapped(root, &ext));
+        ext2 = ext;
+
+        interval_expand(root, &ext, NULL);
+        dprintf("Extending "__S" to .."__S"\n", __F(&ext2), __F(&ext));
+        for (i = 0; i < it_count; i++) {
+                n = &it_array[i];
+                if (n->valid == 0)
+                        continue;
+
+                if (extent_overlapped(&ext, &n->node.in_extent)) {
+                        error("Extending "__S" to .."__S" overlaps node"__S"\n",
+                                __F(&ext2), __F(&ext), __F(&n->node.in_extent));
+                }
+
+                if (n->node.in_extent.end < ext2.start)
+                        low = max_u64(n->node.in_extent.end + 1, low);
+
+                if (n->node.in_extent.start > ext2.end)
+                        high = min_u64(n->node.in_extent.start - 1, high);
+        }
+
+        /* only expanding high right now */
+        if (ext2.start != ext.start || high != ext.end) {
+                ext2.start = low, ext2.end = high;
+                error("Real extending result:"__S", expected:"__S"\n",
+                       __F(&ext), __F(&ext2));
+        }
+
+        return 0;
+}
+
+static int contended_count = 0; 
+#define LOOP_COUNT 1000
+static enum interval_iter perf_cb(struct interval_node *n, void *args)
+{
+        unsigned long count = LOOP_COUNT;
+        while (count--);
+        contended_count++;
+        return INTERVAL_ITER_CONT;
+}
+
+static inline long tv_delta(struct timeval *s, struct timeval *e)
+{
+        long c = e->tv_sec - s->tv_sec;
+        c *= 1000;
+        c += (long int)(e->tv_usec - s->tv_usec) / 1000;
+        dprintf("\tStart: %lu:%lu -> End: %lu:%lu\n", 
+                s->tv_sec, s->tv_usec, e->tv_sec, e->tv_usec);
+        return c;
+}
+
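tv_delta() above returns the elapsed wall-clock time in milliseconds. A quick stand-alone check of the arithmetic (assuming normalized timevals):

#include <assert.h>
#include <sys/time.h>

static long delta_ms(const struct timeval *s, const struct timeval *e)
{
        return (e->tv_sec - s->tv_sec) * 1000 +
               (e->tv_usec - s->tv_usec) / 1000;
}

int main(void)
{
        struct timeval s = { 10, 250000 }, e = { 12, 750000 };

        assert(delta_ms(&s, &e) == 2500);
        return 0;
}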
+static int it_test_performance(struct interval_node *root, unsigned long len)
+{
+        int i = 0, interval_time, list_time;
+        struct interval_node_extent ext;
+        struct it_node *n;
+        struct timeval start, end;
+        unsigned long count;
+        
+        ext.start = (random() % (max_count - len)) & ALIGN_MASK;
+        ext.end = (ext.start + len) & ALIGN_MASK;
+        if (have_wide_lock) {
+                ext.start = (max_count - len) & ALIGN_MASK;
+                ext.end = max_count;
+        }
+
+        dprintf("Extent search"__S"\n", __F(&ext));
+
+        /* list */
+        contended_count = 0;
+        gettimeofday(&start, NULL);
+        list_for_each_entry(n, &header, list) {
+                if (extent_overlapped(&ext, &n->node.in_extent)) {
+                        count = LOOP_COUNT;
+                        while (count--);
+                        contended_count++;
+                }
+        }
+        gettimeofday(&end, NULL);
+        list_time = tv_delta(&start, &end);
+        i = contended_count;
+
+        /* interval */
+        contended_count = 0;
+        gettimeofday(&start, NULL);
+        interval_search(root, &ext, perf_cb, &contended_count);
+        gettimeofday(&end, NULL);
+        interval_time = tv_delta(&start, &end);
+
+        if (i != contended_count)
+                error("counts of contended locks don't match (%d: %d)\n",
+                      i, contended_count);
+
+        printf("\tList vs Int. search: \n\t\t"
+               "(%d vs %d)ms, %d contended locks.\n",
+               list_time, interval_time, contended_count);
+
+        return 0;
+}
+
+static struct interval_node *it_test_helper(struct interval_node *root)
+{
+        int idx, count = 0;
+        struct it_node *n;
+
+        count = random() % it_count;
+        while (count--) {
+                idx = random() % it_count;
+                n = &it_array[idx];
+                if (n->valid) {
+                        if (!interval_find(root, &n->node.in_extent))
+                                error("Cannot find an existent node\n");
+                        dprintf("Erasing a node "__S"\n", 
+                                __F(&n->node.in_extent));
+                        interval_erase(&n->node, &root);
+                        n->valid = 0;
+                        list_del_init(&n->list);
+                } else {
+                        __u64 low, high;
+                        low = (random() % max_count) & ALIGN_MASK;
+                        high = ((random() % max_count + 1) & ALIGN_MASK) + low;
+                        if (high > max_count)
+                                high = max_count;
+                        interval_set(&n->node, low, high);
+                        while (interval_insert(&n->node, &root))
+                                interval_set(&n->node, low, ++high);
+                        dprintf("Adding a node "__S"\n", 
+                                __F(&n->node.in_extent));
+                        n->valid = 1;
+                        list_add(&n->list, &header);
+                }
+        }
+
+        return root;
+}
+
+static struct interval_node *it_test_init(int count)
+{
+        int i;
+        uint64_t high, low, len;
+        struct it_node *n;
+        struct interval_node *root = NULL;
+
+        it_count = count;
+        it_array = (struct it_node *)malloc(sizeof(struct it_node) * count);
+        if (it_array == NULL)
+                error("it_array == NULL, no memory\n");
+
+        have_wide_lock = 0;
+        for (i = 0; i < count; i++) {
+                n = &it_array[i];
+                do {
+                        low = (random() % max_count + 1) & ALIGN_MASK;
+                        len = (random() % 256 + 1) * ALIGN_SIZE;
+                        if (!have_wide_lock && !(random() % count)) {
+                                low = 0;
+                                len = max_count;
+                                have_wide_lock = 1;
+                        }
+                        high = low + (len & ALIGN_MASK);
+
+                        interval_set(&n->node, low, high);
+                } while (interval_insert(&n->node, &root));
+                n->hit = 0;
+                n->valid = 1;
+                if (i == 0)
+                        list_add_tail(&n->list, &header);
+                else
+                        list_add_tail(&n->list, &it_array[rand()%i].list);
+        }
+
+        return root;
+}
+
+static void it_test_fini(void)
+{
+        free(it_array);
+        it_array = NULL;
+        it_count = 0;
+        max_count = 0;
+}
+
+int main(int argc, char *argv[])
+{
+        int count = 5, perf = 0;
+        struct interval_node *root;
+        struct timeval tv;
+
+        gettimeofday(&tv, NULL);
+        srandom(tv.tv_usec);
+
+        if (argc == 2) {
+                if (strcmp(argv[1], "-p"))
+                        error("Unknown option, usage: %s [-p]\n", argv[0]);
+                perf = 1;
+                count = 1;
+        }
+
+        if (perf) {
+                unsigned long M = 1024 * 1024;
+                root = it_test_init(1000000);
+                printf("1M locks with 4K request size\n");
+                it_test_performance(root, 4096);
+                printf("1M locks with 128K request size\n");
+                it_test_performance(root, 128 * 1024);
+                printf("1M locks with 256K request size\n");
+                it_test_performance(root, 256 * 1024);
+                printf("1M locks with 1M request size\n");
+                it_test_performance(root, 1 * M);
+                printf("1M locks with 16M request size\n");
+                it_test_performance(root, 16 * M);
+                printf("1M locks with 32M request size\n");
+                it_test_performance(root, 32 * M);
+                printf("1M locks with 64M request size\n");
+                it_test_performance(root, 64 * M);
+                printf("1M locks with 128M request size\n");
+                it_test_performance(root, 128 * M);
+                printf("1M locks with 256M request size\n");
+                it_test_performance(root, 256 * M);
+                printf("1M locks with 512M request size\n");
+                it_test_performance(root, 512 * M);
+                printf("1M locks with 1G request size\n");
+                it_test_performance(root, 1024 * M);
+                printf("1M locks with 2G request size\n");
+                it_test_performance(root, 2048 * M);
+                printf("1M locks with 3G request size\n");
+                it_test_performance(root, 3072 * M);
+                printf("1M locks with 4G request size\n");
+                it_test_performance(root, max_count - 1);
+                it_test_fini();
+                return 0;
+        }
+
+        root = it_test_init(random() % 100000 + 1000);
+        while (count--) {
+                it_test_sanity(root);
+                it_test_iterate(root);
+                it_test_iterate_reverse(root);
+                it_test_find(root);
+                it_test_search_hole(root);
+                it_test_search(root);
+                root = it_test_helper(root);
+        }
+        it_test_fini();
+
+        return 0;
+}
index 3c9b4b6..0f6b129 100644 (file)
@@ -606,6 +606,18 @@ test_30() { #bug #11110
 
 run_test 30 "recreate file race ========="
 
+test_31() {
+        mkdir -p $DIR1/$tdir || error "Creating dir $DIR1/$tdir"
+        writes=`LANG=C dd if=/dev/zero of=$DIR/$tdir/$tfile count=1 2>&1 |
+                awk 'BEGIN { FS="+" } /out/ {print $1}'`
+        #define OBD_FAIL_LDLM_CANCEL_BL_CB_RACE   0x314
+        sysctl -w lustre.fail_loc=0x314
+        reads=`LANG=C dd if=$DIR2/$tdir/$tfile of=/dev/null 2>&1 |
+               awk 'BEGIN { FS="+" } /in/ {print $1}'`
+        sysctl -w lustre.fail_loc=0
+        [ $reads -eq $writes ] || error "read" $reads "blocks, must be" $writes
+}
+run_test 31 "voluntary cancel / blocking ast race=============="
+
 log "cleanup: ======================================================"
 
 check_and_cleanup_lustre
index 628edfa..e963f86 100644 (file)
@@ -203,6 +203,8 @@ void lustre_assert_wire_constants(void)
                  (long long)LCK_GROUP);
         LASSERTF(LCK_MAXMODE == 65, " found %lld\n",
                  (long long)LCK_MAXMODE);
+        LASSERTF(LCK_MODE_NUM == 7, " found %lld\n",
+                 (long long)LCK_MODE_NUM);
         CLASSERT(LDLM_PLAIN == 10);
         CLASSERT(LDLM_EXTENT == 11);
         CLASSERT(LDLM_FLOCK == 12);