From 022b102258cd85314f4fa1fb8322638cc79f4634 Mon Sep 17 00:00:00 2001 From: jxiong Date: Thu, 6 Mar 2008 04:20:32 +0000 Subject: [PATCH] b=11300 r=adilger,oleg,nikita,vitaly Using interval tree manages the extent locks to accelerate the evicting of contended locks. --- lustre/include/Makefile.am | 2 +- lustre/include/interval_tree.h | 106 ++++++ lustre/include/lustre/lustre_idl.h | 8 + lustre/include/lustre_dlm.h | 22 ++ lustre/include/obd_support.h | 5 + lustre/ldlm/Makefile.am | 3 +- lustre/ldlm/interval_tree.c | 752 +++++++++++++++++++++++++++++++++++++ lustre/ldlm/ldlm_extent.c | 395 ++++++++++++++++--- lustre/ldlm/ldlm_internal.h | 19 + lustre/ldlm/ldlm_lock.c | 78 +++- lustre/ldlm/ldlm_lockd.c | 39 ++ lustre/ldlm/ldlm_resource.c | 16 +- lustre/obdfilter/filter.c | 105 ++++-- lustre/ptlrpc/Makefile.in | 4 + lustre/ptlrpc/autoMakefile.am | 3 +- lustre/ptlrpc/wiretest.c | 2 + lustre/tests/Makefile.am | 2 +- lustre/tests/it_test.c | 519 +++++++++++++++++++++++++ lustre/tests/sanityN.sh | 12 + lustre/utils/wiretest.c | 2 + 20 files changed, 1997 insertions(+), 97 deletions(-) create mode 100644 lustre/include/interval_tree.h create mode 100644 lustre/ldlm/interval_tree.c create mode 100644 lustre/tests/it_test.c diff --git a/lustre/include/Makefile.am b/lustre/include/Makefile.am index 27e62fc..7c99903 100644 --- a/lustre/include/Makefile.am +++ b/lustre/include/Makefile.am @@ -16,4 +16,4 @@ EXTRA_DIST = ioctl.h liblustre.h lprocfs_status.h lustre_cfg.h \ obd_ost.h obd_support.h lustre_ver.h lu_object.h lu_time.h \ md_object.h dt_object.h lustre_param.h lustre_mdt.h \ lustre_fid.h lustre_fld.h lustre_req_layout.h lustre_capa.h \ - lustre_idmap.h lustre_eacl.h + lustre_idmap.h lustre_eacl.h interval_tree.h diff --git a/lustre/include/interval_tree.h b/lustre/include/interval_tree.h new file mode 100644 index 0000000..41436c1 --- /dev/null +++ b/lustre/include/interval_tree.h @@ -0,0 +1,106 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * (visit-tags-table FILE) + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002, 2007 Cluster File Systems, Inc. + * Author: Huang Wei + * Author: Jay Xiong + * + * This file is part of the Lustre file system, http://www.lustre.org + * Lustre is a trademark of Cluster File Systems, Inc. + * + * You may have signed or agreed to another license before downloading + * this software. If so, you are bound by the terms and conditions + * of that agreement, and the following does not apply to you. See the + * LICENSE file included with this distribution for more information. + * + * If you did not agree to a different license, then this copy of Lustre + * is open source software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * In either case, Lustre is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * license text for more details. + */ + +#ifndef _INTERVAL_H__ +#define _INTERVAL_H__ + +#include /* __u8, __u64 etc. */ +#include /* LASSERT. 
*/ + +struct interval_node { + struct interval_node *in_left; + struct interval_node *in_right; + struct interval_node *in_parent; + __u8 in_color; + __u8 res1[7]; /* tags, 8-bytes aligned */ + __u64 in_max_high; + struct interval_node_extent { + __u64 start; + __u64 end; + } in_extent; +}; + +enum interval_iter { + INTERVAL_ITER_CONT = 1, + INTERVAL_ITER_STOP = 2 +}; + +static inline __u64 interval_low(struct interval_node *node) +{ + return node->in_extent.start; +} + +static inline __u64 interval_high(struct interval_node *node) +{ + return node->in_extent.end; +} + +static inline void interval_set(struct interval_node *node, + __u64 start, __u64 end) +{ + LASSERT(start <= end); + node->in_extent.start = start; + node->in_extent.end = end; + node->in_max_high = end; +} + +/* Rules to write an interval callback. + * - the callback returns INTERVAL_ITER_STOP when it thinks the iteration + * should be stopped. It will then cause the iteration function to return + * immediately with return value INTERVAL_ITER_STOP. + * - callbacks for interval_iterate and interval_iterate_reverse: Every + * nodes in the tree will be set to @node before the callback being called + * - callback for interval_search: Only overlapped node will be set to @node + * before the callback being called. + */ +typedef enum interval_iter (*interval_callback_t)(struct interval_node *node, + void *args); + +struct interval_node *interval_insert(struct interval_node *node, + struct interval_node **root); +void interval_erase(struct interval_node *node, struct interval_node **root); + +/* Search the extents in the tree and call @func for each overlapped + * extents. */ +enum interval_iter interval_search(struct interval_node *root, + struct interval_node_extent *ex, + interval_callback_t func, void *data); + +/* Iterate every node in the tree - by reverse order or regular order. 
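+ * interval_iterate() visits the nodes in ascending (start, end) order and
+ * interval_iterate_reverse() in descending order; both visit every node,
+ * unlike interval_search() which only visits overlapping candidates.
+ *
+ * A minimal usage sketch (illustrative only, the callback name is made up):
+ *
+ *   static enum interval_iter count_cb(struct interval_node *n, void *arg)
+ *   {
+ *           (*(int *)arg)++;
+ *           return INTERVAL_ITER_CONT;
+ *   }
+ *
+ *   int count = 0;
+ *   interval_iterate(root, count_cb, &count);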
*/ +enum interval_iter interval_iterate(struct interval_node *root, + interval_callback_t func, void *data); +enum interval_iter interval_iterate_reverse(struct interval_node *root, + interval_callback_t func,void *data); + +void interval_expand(struct interval_node *root, + struct interval_node_extent *ext, + struct interval_node_extent *limiter); +int interval_is_overlapped(struct interval_node *root, + struct interval_node_extent *ex); +struct interval_node *interval_find(struct interval_node *root, + struct interval_node_extent *ex); +#endif diff --git a/lustre/include/lustre/lustre_idl.h b/lustre/include/lustre/lustre_idl.h index be6b8d1..2a8786f 100644 --- a/lustre/include/lustre/lustre_idl.h +++ b/lustre/include/lustre/lustre_idl.h @@ -1652,6 +1652,8 @@ typedef enum { LCK_MAXMODE } ldlm_mode_t; +#define LCK_MODE_NUM 7 + typedef enum { LDLM_PLAIN = 10, LDLM_EXTENT = 11, @@ -1668,6 +1670,12 @@ struct ldlm_extent { __u64 gid; }; +static inline int ldlm_extent_overlap(struct ldlm_extent *ex1, + struct ldlm_extent *ex2) +{ + return (ex1->start <= ex2->end) && (ex2->start <= ex1->end); +} + struct ldlm_inodebits { __u64 bits; }; diff --git a/lustre/include/lustre_dlm.h b/lustre/include/lustre_dlm.h index 7789517..48722d5 100644 --- a/lustre/include/lustre_dlm.h +++ b/lustre/include/lustre_dlm.h @@ -21,6 +21,7 @@ #include #include #include /* for obd_export, for LDLM_DEBUG */ +#include /* for interval_node{}, ldlm_extent */ struct obd_ops; struct obd_device; @@ -365,6 +366,23 @@ typedef int (*ldlm_completion_callback)(struct ldlm_lock *lock, int flags, void *data); typedef int (*ldlm_glimpse_callback)(struct ldlm_lock *lock, void *data); +/* Interval node data for each LDLM_EXTENT lock */ +struct ldlm_interval { + struct interval_node li_node; /* node for tree mgmt */ + struct list_head li_group; /* the locks which have the same + * policy - group of the policy */ +}; +#define to_ldlm_interval(n) container_of(n, struct ldlm_interval, li_node) + +/* the interval tree must be accessed inside the resource lock. 
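+ * Each resource keeps one tree per lock mode (lr_itree[LCK_MODE_NUM]);
+ * lock_mode_to_index() maps a mode bit to its slot, and lit_size counts
+ * the granted extent locks currently held in that tree.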
*/ +struct ldlm_interval_tree { + /* tree size, this variable is used to count + * granted PW locks in ldlm_extent_policy()*/ + int lit_size; + ldlm_mode_t lit_mode; /* lock mode */ + struct interval_node *lit_root; /* actually ldlm_interval */ +}; + struct ldlm_lock { struct portals_handle l_handle; // must be first in the structure atomic_t l_refc; @@ -385,6 +403,8 @@ struct ldlm_lock { struct list_head l_sl_mode; // skip pointer for request mode struct list_head l_sl_policy; // skip pointer for inodebits + struct ldlm_interval *l_tree_node; /* tree node for ldlm_extent */ + /* protected by led_lock */ struct list_head l_export_chain; // per-export chain of locks @@ -458,6 +478,8 @@ struct ldlm_resource { struct ldlm_res_id lr_name; atomic_t lr_refcount; + struct ldlm_interval_tree lr_itree[LCK_MODE_NUM]; /* interval trees*/ + /* Server-side-only lock value block elements */ struct semaphore lr_lvb_sem; __u32 lr_lvb_len; diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index d029d99..8f0143f 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -197,6 +197,11 @@ int __obd_fail_check_set(__u32 id, __u32 value, int set); #define OBD_FAIL_LDLM_GLIMPSE 0x30f #define OBD_FAIL_LDLM_CANCEL_RACE 0x310 #define OBD_FAIL_LDLM_CANCEL_EVICT_RACE 0x311 +/* +#define OBD_FAIL_LDLM_PAUSE_CANCEL 0x312 +#define OBD_FAIL_LDLM_CLOSE_THREAD 0x313 +*/ +#define OBD_FAIL_LDLM_CANCEL_BL_CB_RACE 0x314 #define OBD_FAIL_OSC 0x400 #define OBD_FAIL_OSC_BRW_READ_BULK 0x401 diff --git a/lustre/ldlm/Makefile.am b/lustre/ldlm/Makefile.am index 7beda3d..b5993fc 100644 --- a/lustre/ldlm/Makefile.am +++ b/lustre/ldlm/Makefile.am @@ -10,4 +10,5 @@ MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ DIST_SOURCES = ldlm_extent.c ldlm_flock.c ldlm_internal.h ldlm_lib.c \ ldlm_lock.c ldlm_lockd.c ldlm_plain.c ldlm_request.c \ - ldlm_resource.c l_lock.c ldlm_inodebits.c ldlm_pool.c + ldlm_resource.c l_lock.c ldlm_inodebits.c ldlm_pool.c \ + interval_tree.c diff --git a/lustre/ldlm/interval_tree.c b/lustre/ldlm/interval_tree.c new file mode 100644 index 0000000..bedf5b3 --- /dev/null +++ b/lustre/ldlm/interval_tree.c @@ -0,0 +1,752 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Interval tree library used by ldlm extent lock code + * + * Copyright (c) 2007 Cluster File Systems, Inc. + * Author: Huang Wei + * Author: Jay Xiong + * + * This file is part of the Lustre file system, http://www.lustre.org + * Lustre is a trademark of Cluster File Systems, Inc. + * + * You may have signed or agreed to another license before downloading + * this software. If so, you are bound by the terms and conditions + * of that agreement, and the following does not apply to you. See the + * LICENSE file included with this distribution for more information. + * + * If you did not agree to a different license, then this copy of Lustre + * is open source software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * In either case, Lustre is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * license text for more details. 
+ */ +#ifdef __KERNEL__ +# include +#else +# include +# include +#endif +#include +#include + +enum { + INTERVAL_RED = 0, + INTERVAL_BLACK = 1 +}; + +static inline int node_is_left_child(struct interval_node *node) +{ + LASSERT(node->in_parent != NULL); + return node == node->in_parent->in_left; +} + +static inline int node_is_right_child(struct interval_node *node) +{ + LASSERT(node->in_parent != NULL); + return node == node->in_parent->in_right; +} + +static inline int node_is_red(struct interval_node *node) +{ + return node->in_color == INTERVAL_RED; +} + +static inline int node_is_black(struct interval_node *node) +{ + return node->in_color == INTERVAL_BLACK; +} + +static inline int extent_compare(struct interval_node_extent *e1, + struct interval_node_extent *e2) +{ + int rc; + if (e1->start == e2->start) { + if (e1->end < e2->end) + rc = -1; + else if (e1->end > e2->end) + rc = 1; + else + rc = 0; + } else { + if (e1->start < e2->start) + rc = -1; + else + rc = 1; + } + return rc; +} + +static inline int extent_equal(struct interval_node_extent *e1, + struct interval_node_extent *e2) +{ + return (e1->start == e2->start) && (e1->end == e2->end); +} + +static inline int extent_overlapped(struct interval_node_extent *e1, + struct interval_node_extent *e2) +{ + return (e1->start <= e2->end) && (e2->start <= e1->end); +} + +static inline int node_compare(struct interval_node *n1, + struct interval_node *n2) +{ + return extent_compare(&n1->in_extent, &n2->in_extent); +} + +static inline int node_equal(struct interval_node *n1, + struct interval_node *n2) +{ + return extent_equal(&n1->in_extent, &n2->in_extent); +} + +static inline __u64 max_u64(__u64 x, __u64 y) +{ + return x > y ? x : y; +} + +static inline __u64 min_u64(__u64 x, __u64 y) +{ + return x < y ? 
x : y; +} + +#define interval_for_each(node, root) \ +for (node = interval_first(root); node != NULL; \ + node = interval_next(node)) + +#define interval_for_each_reverse(node, root) \ +for (node = interval_last(root); node != NULL; \ + node = interval_prev(node)) + +static struct interval_node *interval_first(struct interval_node *node) +{ + ENTRY; + + if (!node) + RETURN(NULL); + while (node->in_left) + node = node->in_left; + RETURN(node); +} + +static struct interval_node *interval_last(struct interval_node *node) +{ + ENTRY; + + if (!node) + RETURN(NULL); + while (node->in_right) + node = node->in_right; + RETURN(node); +} + +static struct interval_node *interval_next(struct interval_node *node) +{ + ENTRY; + + if (!node) + RETURN(NULL); + if (node->in_right) + RETURN(interval_first(node->in_right)); + while (node->in_parent && node_is_right_child(node)) + node = node->in_parent; + RETURN(node->in_parent); +} + +static struct interval_node *interval_prev(struct interval_node *node) +{ + ENTRY; + + if (!node) + RETURN(NULL); + + if (node->in_left) + RETURN(interval_last(node->in_left)); + + while (node->in_parent && node_is_left_child(node)) + node = node->in_parent; + + RETURN(node->in_parent); +} + +enum interval_iter interval_iterate(struct interval_node *root, + interval_callback_t func, + void *data) +{ + struct interval_node *node; + enum interval_iter rc = INTERVAL_ITER_CONT; + ENTRY; + + interval_for_each(node, root) { + rc = func(node, data); + if (rc == INTERVAL_ITER_STOP) + break; + } + + RETURN(rc); +} +EXPORT_SYMBOL(interval_iterate); + +enum interval_iter interval_iterate_reverse(struct interval_node *root, + interval_callback_t func, + void *data) +{ + struct interval_node *node; + enum interval_iter rc = INTERVAL_ITER_CONT; + ENTRY; + + interval_for_each_reverse(node, root) { + rc = func(node, data); + if (rc == INTERVAL_ITER_STOP) + break; + } + + RETURN(rc); +} +EXPORT_SYMBOL(interval_iterate_reverse); + +/* try to find a node with same interval in the tree, + * if found, return the pointer to the node, otherwise return NULL*/ +struct interval_node *interval_find(struct interval_node *root, + struct interval_node_extent *ex) +{ + struct interval_node *walk = root; + int rc; + ENTRY; + + while (walk) { + rc = extent_compare(ex, &walk->in_extent); + if (rc == 0) + break; + else if (rc < 0) + walk = walk->in_left; + else + walk = walk->in_right; + } + + RETURN(walk); +} +EXPORT_SYMBOL(interval_find); + +static void __rotate_change_maxhigh(struct interval_node *node, + struct interval_node *rotate) +{ + __u64 left_max, right_max; + + rotate->in_max_high = node->in_max_high; + left_max = node->in_left ? node->in_left->in_max_high : 0; + right_max = node->in_right ? node->in_right->in_max_high : 0; + node->in_max_high = max_u64(interval_high(node), + max_u64(left_max,right_max)); +} + +/* The left rotation "pivots" around the link from node to node->right, and + * - node will be linked to node->right's left child, and + * - node->right's left child will be linked to node's right child. 
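+ *
+ * For example (A, B and C are arbitrary subtrees):
+ *
+ *        node                 right
+ *       /    \               /     \
+ *      A     right   ==>   node     C
+ *            /   \         /   \
+ *           B     C       A     B
+ *
+ * in_max_high of node and right is recomputed afterwards by
+ * __rotate_change_maxhigh().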
*/ +static void __rotate_left(struct interval_node *node, + struct interval_node **root) +{ + struct interval_node *right = node->in_right; + struct interval_node *parent = node->in_parent; + + node->in_right = right->in_left; + if (node->in_right) + right->in_left->in_parent = node; + + right->in_left = node; + right->in_parent = parent; + if (parent) { + if (node_is_left_child(node)) + parent->in_left = right; + else + parent->in_right = right; + } else { + *root = right; + } + node->in_parent = right; + + /* update max_high for node and right */ + __rotate_change_maxhigh(node, right); +} + +/* The right rotation "pivots" around the link from node to node->left, and + * - node will be linked to node->left's right child, and + * - node->left's right child will be linked to node's left child. */ +static void __rotate_right(struct interval_node *node, + struct interval_node **root) +{ + struct interval_node *left = node->in_left; + struct interval_node *parent = node->in_parent; + + node->in_left = left->in_right; + if (node->in_left) + left->in_right->in_parent = node; + left->in_right = node; + + left->in_parent = parent; + if (parent) { + if (node_is_right_child(node)) + parent->in_right = left; + else + parent->in_left = left; + } else { + *root = left; + } + node->in_parent = left; + + /* update max_high for node and left */ + __rotate_change_maxhigh(node, left); +} + +#define interval_swap(a, b) do { \ + struct interval_node *c = a; a = b; b = c; \ +} while (0) + +/* + * Operations INSERT and DELETE, when run on a tree with n keys, + * take O(logN) time.Because they modify the tree, the result + * may violate the red-black properties.To restore these properties, + * we must change the colors of some of the nodes in the tree + * and also change the pointer structure. 
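+ *
+ * The properties being restored are the usual red-black invariants:
+ *  - every node is either red or black;
+ *  - a red node never has a red child;
+ *  - every path from the root down to a leaf contains the same number
+ *    of black nodes;
+ *  - the root is black.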
+ */ +static void interval_insert_color(struct interval_node *node, + struct interval_node **root) +{ + struct interval_node *parent, *gparent; + ENTRY; + + while ((parent = node->in_parent) && node_is_red(parent)) { + gparent = parent->in_parent; + /* Parent is RED, so gparent must not be NULL */ + if (node_is_left_child(parent)) { + struct interval_node *uncle; + uncle = gparent->in_right; + if (uncle && node_is_red(uncle)) { + uncle->in_color = INTERVAL_BLACK; + parent->in_color = INTERVAL_BLACK; + gparent->in_color = INTERVAL_RED; + node = gparent; + continue; + } + + if (parent->in_right == node) { + __rotate_left(parent, root); + interval_swap(node, parent); + } + + parent->in_color = INTERVAL_BLACK; + gparent->in_color = INTERVAL_RED; + __rotate_right(gparent, root); + } else { + struct interval_node *uncle; + uncle = gparent->in_left; + if (uncle && node_is_red(uncle)) { + uncle->in_color = INTERVAL_BLACK; + parent->in_color = INTERVAL_BLACK; + gparent->in_color = INTERVAL_RED; + node = gparent; + continue; + } + + if (node_is_left_child(node)) { + __rotate_right(parent, root); + interval_swap(node, parent); + } + + parent->in_color = INTERVAL_BLACK; + gparent->in_color = INTERVAL_RED; + __rotate_left(gparent, root); + } + } + + (*root)->in_color = INTERVAL_BLACK; + EXIT; +} + +struct interval_node *interval_insert(struct interval_node *node, + struct interval_node **root) + +{ + struct interval_node **p, *parent = NULL; + ENTRY; + + p = root; + while (*p) { + parent = *p; + if (node_equal(parent, node)) + RETURN(parent); + + /* max_high field must be updated after each iteration */ + if (parent->in_max_high < interval_high(node)) + parent->in_max_high = interval_high(node); + + if (node_compare(node, parent) < 0) + p = &parent->in_left; + else + p = &parent->in_right; + } + + /* link node into the tree */ + node->in_parent = parent; + node->in_color = INTERVAL_RED; + node->in_left = node->in_right = NULL; + *p = node; + + interval_insert_color(node, root); + + RETURN(NULL); +} +EXPORT_SYMBOL(interval_insert); + +static inline int node_is_black_or_0(struct interval_node *node) +{ + return !node || node_is_black(node); +} + +static void interval_erase_color(struct interval_node *node, + struct interval_node *parent, + struct interval_node **root) +{ + struct interval_node *tmp; + ENTRY; + + while (node_is_black_or_0(node) && node != *root) { + if (parent->in_left == node) { + tmp = parent->in_right; + if (node_is_red(tmp)) { + tmp->in_color = INTERVAL_BLACK; + parent->in_color = INTERVAL_RED; + __rotate_left(parent, root); + tmp = parent->in_right; + } + if (node_is_black_or_0(tmp->in_left) && + node_is_black_or_0(tmp->in_right)) { + tmp->in_color = INTERVAL_RED; + node = parent; + parent = node->in_parent; + } else { + if (node_is_black_or_0(tmp->in_right)) { + struct interval_node *o_left; + if ((o_left = tmp->in_left)) + o_left->in_color = INTERVAL_BLACK; + tmp->in_color = INTERVAL_RED; + __rotate_right(tmp, root); + tmp = parent->in_right; + } + tmp->in_color = parent->in_color; + parent->in_color = INTERVAL_BLACK; + if (tmp->in_right) + tmp->in_right->in_color = INTERVAL_BLACK; + __rotate_left(parent, root); + node = *root; + break; + } + } else { + tmp = parent->in_left; + if (node_is_red(tmp)) { + tmp->in_color = INTERVAL_BLACK; + parent->in_color = INTERVAL_RED; + __rotate_right(parent, root); + tmp = parent->in_left; + } + if (node_is_black_or_0(tmp->in_left) && + node_is_black_or_0(tmp->in_right)) { + tmp->in_color = INTERVAL_RED; + node = parent; + parent = 
node->in_parent; + } else { + if (node_is_black_or_0(tmp->in_left)) { + struct interval_node *o_right; + if ((o_right = tmp->in_right)) + o_right->in_color = INTERVAL_BLACK; + tmp->in_color = INTERVAL_RED; + __rotate_left(tmp, root); + tmp = parent->in_left; + } + tmp->in_color = parent->in_color; + parent->in_color = INTERVAL_BLACK; + if (tmp->in_left) + tmp->in_left->in_color = INTERVAL_BLACK; + __rotate_right(parent, root); + node = *root; + break; + } + } + } + if (node) + node->in_color = INTERVAL_BLACK; + EXIT; +} + +/* + * if the @max_high value of @node is changed, this function traverse a path + * from node up to the root to update max_high for the whole tree. + */ +static void update_maxhigh(struct interval_node *node, + __u64 old_maxhigh) +{ + __u64 left_max, right_max; + ENTRY; + + while (node) { + left_max = node->in_left ? node->in_left->in_max_high : 0; + right_max = node->in_right ? node->in_right->in_max_high : 0; + node->in_max_high = max_u64(interval_high(node), + max_u64(left_max, right_max)); + + if (node->in_max_high >= old_maxhigh) + break; + node = node->in_parent; + } + EXIT; +} + +void interval_erase(struct interval_node *node, + struct interval_node **root) +{ + struct interval_node *child, *parent; + int color; + ENTRY; + + if (!node->in_left) { + child = node->in_right; + } else if (!node->in_right) { + child = node->in_left; + } else { /* Both left and right child are not NULL */ + struct interval_node *old = node; + + node = interval_next(node); + child = node->in_right; + parent = node->in_parent; + color = node->in_color; + + if (child) + child->in_parent = parent; + if (parent == old) { + parent->in_right = child; + parent = node; + } else { + parent->in_left = child; + } + + node->in_color = old->in_color; + node->in_right = old->in_right; + node->in_left = old->in_left; + node->in_parent = old->in_parent; + + if (old->in_parent) { + if (node_is_left_child(old)) + old->in_parent->in_left = node; + else + old->in_parent->in_right = node; + } else { + *root = node; + } + + old->in_left->in_parent = node; + if (old->in_right) + old->in_right->in_parent = node; + update_maxhigh(child, node->in_max_high); + update_maxhigh(node, old->in_max_high); + goto color; + } + parent = node->in_parent; + color = node->in_color; + + if (child) + child->in_parent = parent; + if (parent) { + if (node_is_left_child(node)) + parent->in_left = child; + else + parent->in_right = child; + } else { + *root = child; + } + + update_maxhigh(child, node->in_max_high); + +color: + if (color == INTERVAL_BLACK) + interval_erase_color(child, parent, root); + EXIT; +} +EXPORT_SYMBOL(interval_erase); + +static inline int interval_may_overlap(struct interval_node *node, + struct interval_node_extent *ext) +{ + return (ext->start <= node->in_max_high && + ext->end >= interval_low(node)); +} + +/* + * This function finds all intervals that overlap interval ext, + * and calls func to handle resulted intervals one by one. + * in lustre, this function will find all conflicting locks in + * the granted queue and add these locks to the ast work list. 
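+ *
+ * The recursion below is only documentation; the real interval_search()
+ * implements the same traversal iteratively: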
+ * + * { + * if (node == NULL) + * return 0; + * if (ext->end < interval_low(node)) { + * interval_search(node->in_left, ext, func, data); + * } else if (interval_may_overlap(node, ext)) { + * if (extent_overlapped(ext, &node->in_extent)) + * func(node, data); + * interval_search(node->in_left, ext, func, data); + * interval_search(node->in_right, ext, func, data); + * } + * return 0; + * } + * + */ +enum interval_iter interval_search(struct interval_node *node, + struct interval_node_extent *ext, + interval_callback_t func, + void *data) +{ + struct interval_node *parent; + enum interval_iter rc = INTERVAL_ITER_CONT; + + LASSERT(ext != NULL); + LASSERT(func != NULL); + + while (node) { + if (ext->end < interval_low(node)) { + if (node->in_left) { + node = node->in_left; + continue; + } + } else if (interval_may_overlap(node, ext)) { + if (extent_overlapped(ext, &node->in_extent)) { + rc = func(node, data); + if (rc == INTERVAL_ITER_STOP) + break; + } + + if (node->in_left) { + node = node->in_left; + continue; + } + if (node->in_right) { + node = node->in_right; + continue; + } + } + + parent = node->in_parent; + while (parent) { + if (node_is_left_child(node) && + parent->in_right) { + /* If we ever got the left, it means that the + * parent met ext->endin_right; + break; + } + node = parent; + parent = parent->in_parent; + } + if (parent == NULL || !interval_may_overlap(parent, ext)) + break; + } + + return rc; +} +EXPORT_SYMBOL(interval_search); + +static enum interval_iter interval_overlap_cb(struct interval_node *n, + void *args) +{ + *(int *)args = 1; + return INTERVAL_ITER_STOP; +} + +int interval_is_overlapped(struct interval_node *root, + struct interval_node_extent *ext) +{ + int has = 0; + (void)interval_search(root, ext, interval_overlap_cb, &has); + return has; +} +EXPORT_SYMBOL(interval_is_overlapped); + +/* Don't expand to low. Expanding downwards is expensive, and meaningless to + * some extents, because programs seldom do IO backward. + * + * The recursive algorithm of expanding low: + * expand_low { + * struct interval_node *tmp; + * static __u64 res = 0; + * + * if (root == NULL) + * return res; + * if (root->in_max_high < low) { + * res = max_u64(root->in_max_high + 1, res); + * return res; + * } else if (low < interval_low(root)) { + * interval_expand_low(root->in_left, low); + * return res; + * } + * + * if (interval_high(root) < low) + * res = max_u64(interval_high(root) + 1, res); + * interval_expand_low(root->in_left, low); + * interval_expand_low(root->in_right, low); + * + * return res; + * } + * + * It's much easy to eliminate the recursion, see interval_search for + * an example. -jay + */ +static inline __u64 interval_expand_low(struct interval_node *root, __u64 low) +{ + /* we only concern the empty tree right now. */ + if (root == NULL) + return 0; + return low; +} + +static inline __u64 interval_expand_high(struct interval_node *node, __u64 high) +{ + __u64 result = ~0; + + while (node != NULL) { + if (node->in_max_high < high) + break; + + if (interval_low(node) > high) { + result = interval_low(node) - 1; + node = node->in_left; + } else { + node = node->in_right; + } + } + + return result; +} + +/* expanding the extent based on @ext. */ +void interval_expand(struct interval_node *root, + struct interval_node_extent *ext, + struct interval_node_extent *limiter) +{ + /* The assertion of interval_is_overlapped is expensive because we may + * travel many nodes to find the overlapped node. 
*/ + LASSERT(interval_is_overlapped(root, ext) == 0); + if (!limiter || limiter->start < ext->start) + ext->start = interval_expand_low(root, ext->start); + if (!limiter || limiter->end > ext->end) + ext->end = interval_expand_high(root, ext->end); + LASSERT(interval_is_overlapped(root, ext) == 0); +} +EXPORT_SYMBOL(interval_expand); diff --git a/lustre/ldlm/ldlm_extent.c b/lustre/ldlm/ldlm_extent.c index 1880c43..f1f88ce 100644 --- a/lustre/ldlm/ldlm_extent.c +++ b/lustre/ldlm/ldlm_extent.c @@ -35,26 +35,126 @@ #include "ldlm_internal.h" +#define LDLM_MAX_GROWN_EXTENT (32 * 1024 * 1024 - 1) + +/* fixup the ldlm_extent after expanding */ +static void ldlm_extent_internal_policy_fixup(struct ldlm_lock *req, + struct ldlm_extent *new_ex, + int conflicting) +{ + ldlm_mode_t req_mode = req->l_req_mode; + __u64 req_start = req->l_req_extent.start; + __u64 req_end = req->l_req_extent.end; + __u64 req_align, mask; + + if (conflicting > 32 && (req_mode == LCK_PW || req_mode == LCK_CW)) { + if (req_end < req_start + LDLM_MAX_GROWN_EXTENT) + new_ex->end = min(req_start + LDLM_MAX_GROWN_EXTENT, + new_ex->end); + } + + if (new_ex->start == 0 && new_ex->end == OBD_OBJECT_EOF) { + EXIT; + return; + } + + /* we need to ensure that the lock extent is properly aligned to what + * the client requested. We align it to the lowest-common denominator + * of the clients requested lock start and end alignment. */ + mask = 0x1000ULL; + req_align = (req_end + 1) | req_start; + if (req_align != 0) { + while ((req_align & mask) == 0) + mask <<= 1; + } + mask -= 1; + /* We can only shrink the lock, not grow it. + * This should never cause lock to be smaller than requested, + * since requested lock was already aligned on these boundaries. */ + new_ex->start = ((new_ex->start - 1) | mask) + 1; + new_ex->end = ((new_ex->end + 1) & ~mask) - 1; + LASSERTF(new_ex->start <= req_start, + "mask "LPX64" grant start "LPU64" req start "LPU64"\n", + mask, new_ex->start, req_start); + LASSERTF(new_ex->end >= req_end, + "mask "LPX64" grant end "LPU64" req end "LPU64"\n", + mask, new_ex->end, req_end); +} + +/* The purpose of this function is to return: + * - the maximum extent + * - containing the requested extent + * - and not overlapping existing conflicting extents outside the requested one + * + * Use interval tree to expand the lock extent for granted lock. 
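+ *
+ * For every per-mode interval tree whose mode conflicts with the request,
+ * the requested extent is expanded inside that tree with interval_expand()
+ * and the result is intersected into @limiter; once the limiter has shrunk
+ * to exactly the requested extent no further growth is possible and the
+ * loop stops early.  This replaces the linear scan of the whole granted
+ * list that ldlm_extent_internal_policy() used to do.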
+ */ +static void ldlm_extent_internal_policy_granted(struct ldlm_lock *req, + struct ldlm_extent *new_ex) +{ + struct ldlm_resource *res = req->l_resource; + ldlm_mode_t req_mode = req->l_req_mode; + __u64 req_start = req->l_req_extent.start; + __u64 req_end = req->l_req_extent.end; + struct ldlm_interval_tree *tree; + struct interval_node_extent limiter = { new_ex->start, new_ex->end }; + int conflicting = 0; + int idx; + ENTRY; + + lockmode_verify(req_mode); + + /* using interval tree to handle the ldlm extent granted locks */ + for (idx = 0; idx < LCK_MODE_NUM; idx++) { + struct interval_node_extent ext = { req_start, req_end }; + + tree = &res->lr_itree[idx]; + if (lockmode_compat(tree->lit_mode, req_mode)) + continue; + + conflicting += tree->lit_size; + if (conflicting > 4) + limiter.start = req_start; + + if (interval_is_overlapped(tree->lit_root, &ext)) + printk("req_mode = %d, tree->lit_mode = %d, tree->lit_size = %d\n", + req_mode, tree->lit_mode, tree->lit_size); + interval_expand(tree->lit_root, &ext, &limiter); + limiter.start = max(limiter.start, ext.start); + limiter.end = min(limiter.end, ext.end); + if (limiter.start == req_start && limiter.end == req_end) + break; + } + + new_ex->start = limiter.start; + new_ex->end = limiter.end; + LASSERT(new_ex->start <= req_start); + LASSERT(new_ex->end >= req_end); + + ldlm_extent_internal_policy_fixup(req, new_ex, conflicting); + EXIT; +} + /* The purpose of this function is to return: * - the maximum extent * - containing the requested extent * - and not overlapping existing conflicting extents outside the requested one */ static void -ldlm_extent_internal_policy(struct list_head *queue, struct ldlm_lock *req, - struct ldlm_extent *new_ex) +ldlm_extent_internal_policy_waiting(struct ldlm_lock *req, + struct ldlm_extent *new_ex) { struct list_head *tmp; + struct ldlm_resource *res = req->l_resource; ldlm_mode_t req_mode = req->l_req_mode; __u64 req_start = req->l_req_extent.start; __u64 req_end = req->l_req_extent.end; - __u64 req_align, mask; int conflicting = 0; ENTRY; lockmode_verify(req_mode); - list_for_each(tmp, queue) { + /* for waiting locks */ + list_for_each(tmp, &res->lr_waiting) { struct ldlm_lock *lock; struct ldlm_extent *l_extent; @@ -85,16 +185,14 @@ ldlm_extent_internal_policy(struct list_head *queue, struct ldlm_lock *req, new_ex->start = req_start; /* If lock doesn't overlap new_ex, skip it. */ - if (l_extent->end < new_ex->start || - l_extent->start > new_ex->end) + if (!ldlm_extent_overlap(l_extent, new_ex)) continue; /* Locks conflicting in requested extents and we can't satisfy * both locks, so ignore it. Either we will ping-pong this * extent (we would regardless of what extent we granted) or * lock is unused and it shouldn't limit our extent growth. */ - if (lock->l_req_extent.end >= req_start && - lock->l_req_extent.start <= req_end) + if (ldlm_extent_overlap(&lock->l_req_extent,&req->l_req_extent)) continue; /* We grow extents downwards only as far as they don't overlap @@ -123,43 +221,11 @@ ldlm_extent_internal_policy(struct list_head *queue, struct ldlm_lock *req, } } -#define LDLM_MAX_GROWN_EXTENT (32 * 1024 * 1024 - 1) - if (conflicting > 32 && (req_mode == LCK_PW || req_mode == LCK_CW)) { - if (req_end < req_start + LDLM_MAX_GROWN_EXTENT) - new_ex->end = min(req_start + LDLM_MAX_GROWN_EXTENT, - new_ex->end); - } - - if (new_ex->start == 0 && new_ex->end == OBD_OBJECT_EOF) { - EXIT; - return; - } - - /* we need to ensure that the lock extent is properly aligned to what - * the client requested. 
We align it to the lowest-common denominator - * of the clients requested lock start and end alignment. */ - mask = 0x1000ULL; - req_align = (req_end + 1) | req_start; - if (req_align != 0) { - while ((req_align & mask) == 0) - mask <<= 1; - } - mask -= 1; - /* We can only shrink the lock, not grow it. - * This should never cause lock to be smaller than requested, - * since requested lock was already aligned on these boundaries. */ - new_ex->start = ((new_ex->start - 1) | mask) + 1; - new_ex->end = ((new_ex->end + 1) & ~mask) - 1; - LASSERTF(new_ex->start <= req_start, - "mask "LPX64" grant start "LPU64" req start "LPU64"\n", - mask, new_ex->start, req_start); - LASSERTF(new_ex->end >= req_end, - "mask "LPX64" grant end "LPU64" req end "LPU64"\n", - mask, new_ex->end, req_end); - + ldlm_extent_internal_policy_fixup(req, new_ex, conflicting); EXIT; } + /* In order to determine the largest possible extent we can grant, we need * to scan all of the queues. */ static void ldlm_extent_policy(struct ldlm_resource *res, @@ -182,8 +248,8 @@ static void ldlm_extent_policy(struct ldlm_resource *res, /* fast-path whole file locks */ return; - ldlm_extent_internal_policy(&res->lr_granted, lock, &new_ex); - ldlm_extent_internal_policy(&res->lr_waiting, lock, &new_ex); + ldlm_extent_internal_policy_granted(lock, &new_ex); + ldlm_extent_internal_policy_waiting(lock, &new_ex); if (new_ex.start != lock->l_policy_data.l_extent.start || new_ex.end != lock->l_policy_data.l_extent.end) { @@ -193,6 +259,42 @@ static void ldlm_extent_policy(struct ldlm_resource *res, } } +struct ldlm_extent_compat_args { + struct list_head *work_list; + struct ldlm_lock *lock; + ldlm_mode_t mode; + int *compat; +}; + +static enum interval_iter ldlm_extent_compat_cb(struct interval_node *n, + void *data) +{ + struct ldlm_extent_compat_args *priv = data; + struct ldlm_interval *node = to_ldlm_interval(n); + struct list_head *work_list = priv->work_list; + struct ldlm_lock *lock, *enq = priv->lock; + ldlm_mode_t mode = priv->mode; + ENTRY; + + LASSERT(!list_empty(&node->li_group)); + + list_for_each_entry(lock, &node->li_group, l_sl_policy) { + /* interval tree is for granted lock */ + LASSERTF(mode == lock->l_granted_mode, + "mode = %s, lock->l_granted_mode = %s\n", + ldlm_lockname[mode], + ldlm_lockname[lock->l_granted_mode]); + + if (lock->l_blocking_ast) + ldlm_add_ast_work_item(lock, enq, work_list); + } + + if (priv->compat) + *priv->compat = 0; + + RETURN(INTERVAL_ITER_CONT); +} + /* Determine if the lock is compatible with all locks on the queue. * We stop walking the queue if we hit ourselves so we don't take * conflicting locks enqueued after us into accound, or we'd wait forever. 
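+ * For the granted queue the per-mode interval trees are consulted instead
+ * of walking the list; only the waiting queue is still scanned linearly
+ * below.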
@@ -209,6 +311,7 @@ ldlm_extent_compat_queue(struct list_head *queue, struct ldlm_lock *req, { struct list_head *tmp; struct ldlm_lock *lock; + struct ldlm_resource *res = req->l_resource; ldlm_mode_t req_mode = req->l_req_mode; __u64 req_start = req->l_req_extent.start; __u64 req_end = req->l_req_extent.end; @@ -218,6 +321,71 @@ ldlm_extent_compat_queue(struct list_head *queue, struct ldlm_lock *req, lockmode_verify(req_mode); + /* Using interval tree for granted lock */ + if (queue == &res->lr_granted) { + struct ldlm_interval_tree *tree; + struct ldlm_extent_compat_args data = {.work_list = work_list, + .lock = req, + .compat = &compat }; + struct interval_node_extent ex = { .start = req_start, + .end = req_end }; + int idx, rc; + + for (idx = 0; idx < LCK_MODE_NUM; idx++) { + tree = &res->lr_itree[idx]; + if (tree->lit_root == NULL) /* empty tree, skipped */ + continue; + + data.mode = tree->lit_mode; + if (lockmode_compat(req_mode, tree->lit_mode)) { + struct ldlm_interval *node; + struct ldlm_extent *extent; + + if (req_mode != LCK_GROUP) + continue; + + /* group lock, grant it immediately if + * compatible */ + node = to_ldlm_interval(tree->lit_root); + extent = ldlm_interval_extent(node); + if (req->l_policy_data.l_extent.gid == + extent->gid) + RETURN(2); + } + + if (tree->lit_mode == LCK_GROUP) { + if (*flags & LDLM_FL_BLOCK_NOWAIT) { + compat = -EWOULDBLOCK; + goto destroylock; + } + + *flags |= LDLM_FL_NO_TIMEOUT; + if (!work_list) + RETURN(0); + + /* if work list is not NULL,add all + locks in the tree to work list */ + compat = 0; + interval_iterate(tree->lit_root, + ldlm_extent_compat_cb, &data); + continue; + } + + if (!work_list) { + rc = interval_is_overlapped(tree->lit_root,&ex); + if (rc) + RETURN(0); + } else { + interval_search(tree->lit_root, &ex, + ldlm_extent_compat_cb, &data); + if (!list_empty(work_list) && compat) + compat = 0; + } + } + RETURN(compat); + } + + /* for waiting queue */ list_for_each(tmp, queue) { lock = list_entry(tmp, struct ldlm_lock, l_res_link); @@ -444,8 +612,24 @@ int ldlm_process_extent_lock(struct ldlm_lock *lock, int *flags, int first_enq, unlock_res(res); rc = ldlm_run_ast_work(&rpc_list, LDLM_WORK_BL_AST); lock_res(res); - if (rc == -ERESTART) + + if (rc == -ERESTART) { + /* lock was granted while resource was unlocked. */ + if (lock->l_granted_mode == lock->l_req_mode) { + /* bug 11300: if the lock has been granted, + * break earlier because otherwise, we will go + * to restart and ldlm_resource_unlink will be + * called and it causes the interval node to be + * freed. 
Then we will fail at + * ldlm_extent_add_lock() */ + *flags &= ~(LDLM_FL_BLOCK_GRANTED | LDLM_FL_BLOCK_CONV | + LDLM_FL_BLOCK_WAIT); + GOTO(out, rc = 0); + } + GOTO(restart, -ERESTART); + } + *flags |= LDLM_FL_BLOCK_GRANTED; /* this way we force client to wait for the lock * endlessly once the lock is enqueued -bzzz */ @@ -493,3 +677,124 @@ __u64 ldlm_extent_shift_kms(struct ldlm_lock *lock, __u64 old_kms) RETURN(kms); } + +cfs_mem_cache_t *ldlm_interval_slab; +struct ldlm_interval *ldlm_interval_alloc(struct ldlm_lock *lock) +{ + struct ldlm_interval *node; + ENTRY; + + LASSERT(lock->l_resource->lr_type == LDLM_EXTENT); + OBD_SLAB_ALLOC(node, ldlm_interval_slab, CFS_ALLOC_IO, sizeof(*node)); + if (node == NULL) + RETURN(NULL); + + CFS_INIT_LIST_HEAD(&node->li_group); + ldlm_interval_attach(node, lock); + RETURN(node); +} + +void ldlm_interval_free(struct ldlm_interval *node) +{ + if (node) { + LASSERT(list_empty(&node->li_group)); + OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node)); + } +} + +/* interval tree, for LDLM_EXTENT. */ +void ldlm_interval_attach(struct ldlm_interval *n, + struct ldlm_lock *l) +{ + LASSERT(l->l_tree_node == NULL); + LASSERT(l->l_resource->lr_type == LDLM_EXTENT); + + list_add_tail(&l->l_sl_policy, &n->li_group); + l->l_tree_node = n; +} + +struct ldlm_interval *ldlm_interval_detach(struct ldlm_lock *l) +{ + struct ldlm_interval *n = l->l_tree_node; + + if (n == NULL) + return NULL; + + LASSERT(!list_empty(&n->li_group)); + l->l_tree_node = NULL; + list_del_init(&l->l_sl_policy); + + return (list_empty(&n->li_group) ? n : NULL); +} + +static inline int lock_mode_to_index(ldlm_mode_t mode) +{ + int index; + + LASSERT(mode != 0); + LASSERT(IS_PO2(mode)); + for (index = -1; mode; index++, mode >>= 1) ; + LASSERT(index < LCK_MODE_NUM); + return index; +} + +void ldlm_extent_add_lock(struct ldlm_resource *res, + struct ldlm_lock *lock) +{ + struct interval_node *found, **root; + struct ldlm_interval *node; + struct ldlm_extent *extent; + int idx; + + LASSERT(lock->l_granted_mode == lock->l_req_mode); + + node = lock->l_tree_node; + LASSERT(node != NULL); + + idx = lock_mode_to_index(lock->l_granted_mode); + LASSERT(lock->l_granted_mode == 1 << idx); + LASSERT(lock->l_granted_mode == res->lr_itree[idx].lit_mode); + + /* node extent initialize */ + extent = &lock->l_policy_data.l_extent; + interval_set(&node->li_node, extent->start, extent->end); + + root = &res->lr_itree[idx].lit_root; + found = interval_insert(&node->li_node, root); + if (found) { /* The policy group found. */ + struct ldlm_interval *tmp = ldlm_interval_detach(lock); + LASSERT(tmp != NULL); + ldlm_interval_free(tmp); + ldlm_interval_attach(to_ldlm_interval(found), lock); + } + res->lr_itree[idx].lit_size++; + + /* even though we use interval tree to manage the extent lock, we also + * add the locks into grant list, for debug purpose, .. 
*/ + ldlm_resource_add_lock(res, &res->lr_granted, lock); +} + +void ldlm_extent_unlink_lock(struct ldlm_lock *lock) +{ + struct ldlm_resource *res = lock->l_resource; + struct ldlm_interval *node; + struct ldlm_interval_tree *tree; + int idx; + + if (lock->l_granted_mode != lock->l_req_mode) + return; + + LASSERT(lock->l_tree_node != NULL); + idx = lock_mode_to_index(lock->l_granted_mode); + LASSERT(lock->l_granted_mode == 1 << idx); + tree = &res->lr_itree[idx]; + + LASSERT(tree->lit_root != NULL); /* assure the tree is not null */ + + tree->lit_size--; + node = ldlm_interval_detach(lock); + if (node) { + interval_erase(&node->li_node, &tree->lit_root); + ldlm_interval_free(node); + } +} diff --git a/lustre/ldlm/ldlm_internal.h b/lustre/ldlm/ldlm_internal.h index 294ba72..3127e8c 100644 --- a/lustre/ldlm/ldlm_internal.h +++ b/lustre/ldlm/ldlm_internal.h @@ -112,6 +112,8 @@ int ldlm_process_plain_lock(struct ldlm_lock *lock, int *flags, int first_enq, /* ldlm_extent.c */ int ldlm_process_extent_lock(struct ldlm_lock *lock, int *flags, int first_enq, ldlm_error_t *err, struct list_head *work_list); +void ldlm_extent_add_lock(struct ldlm_resource *res, struct ldlm_lock *lock); +void ldlm_extent_unlink_lock(struct ldlm_lock *lock); /* ldlm_flock.c */ int ldlm_process_flock_lock(struct ldlm_lock *req, int *flags, int first_enq, @@ -137,6 +139,23 @@ struct ldlm_state { struct ldlm_bl_pool *ldlm_bl_pool; }; +/* interval tree, for LDLM_EXTENT. */ +extern cfs_mem_cache_t *ldlm_interval_slab; /* slab cache for ldlm_interval */ +extern void ldlm_interval_attach(struct ldlm_interval *n, struct ldlm_lock *l); +extern struct ldlm_interval *ldlm_interval_detach(struct ldlm_lock *l); +extern struct ldlm_interval *ldlm_interval_alloc(struct ldlm_lock *lock); +extern void ldlm_interval_free(struct ldlm_interval *node); +/* this function must be called with res lock held */ +static inline struct ldlm_extent * +ldlm_interval_extent(struct ldlm_interval *node) +{ + struct ldlm_lock *lock; + LASSERT(!list_empty(&node->li_group)); + + lock = list_entry(node->li_group.next, struct ldlm_lock, l_sl_policy); + return &lock->l_policy_data.l_extent; +} + int ldlm_init(void); void ldlm_exit(void); diff --git a/lustre/ldlm/ldlm_lock.c b/lustre/ldlm/ldlm_lock.c index abc77f6..d541298 100644 --- a/lustre/ldlm/ldlm_lock.c +++ b/lustre/ldlm/ldlm_lock.c @@ -162,6 +162,7 @@ void ldlm_lock_put(struct ldlm_lock *lock) if (lock->l_lvb_data != NULL) OBD_FREE(lock->l_lvb_data, lock->l_lvb_len); + ldlm_interval_free(ldlm_interval_detach(lock)); OBD_FREE_RCU_CB(lock, sizeof(*lock), &lock->l_handle, ldlm_lock_free); } @@ -879,6 +880,8 @@ void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list) lock->l_granted_mode = lock->l_req_mode; if (res->lr_type == LDLM_PLAIN || res->lr_type == LDLM_IBITS) ldlm_grant_lock_with_skiplist(lock); + else if (res->lr_type == LDLM_EXTENT) + ldlm_extent_add_lock(res, lock); else ldlm_resource_add_lock(res, &res->lr_granted, lock); @@ -1136,16 +1139,28 @@ struct ldlm_lock *ldlm_lock_create(struct ldlm_namespace *ns, lock->l_glimpse_ast = glimpse; lock->l_pid = cfs_curproc_pid(); + lock->l_tree_node = NULL; + /* if this is the extent lock, allocate the interval tree node */ + if (type == LDLM_EXTENT) { + if (ldlm_interval_alloc(lock) == NULL) + GOTO(out, 0); + } + if (lvb_len) { lock->l_lvb_len = lvb_len; OBD_ALLOC(lock->l_lvb_data, lvb_len); - if (lock->l_lvb_data == NULL) { - OBD_SLAB_FREE(lock, ldlm_lock_slab, sizeof(*lock)); - RETURN(NULL); - } + if (lock->l_lvb_data == NULL) + 
GOTO(out, 0); } RETURN(lock); + +out: + if (lock->l_lvb_data) + OBD_FREE(lock->l_lvb_data, lvb_len); + ldlm_interval_free(ldlm_interval_detach(lock)); + OBD_SLAB_FREE(lock, ldlm_lock_slab, sizeof(*lock)); + return NULL; } ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns, @@ -1157,6 +1172,7 @@ ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns, int local = ns_is_client(res->lr_namespace); ldlm_processing_policy policy; ldlm_error_t rc = ELDLM_OK; + struct ldlm_interval *node = NULL; ENTRY; do_gettimeofday(&lock->l_enqueued_time); @@ -1183,16 +1199,35 @@ ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns, } } + /* For a replaying lock, it might be already in granted list. So + * unlinking the lock will cause the interval node to be freed, we + * have to allocate the interval node early otherwise we can't regrant + * this lock in the future. - jay */ + if (!local && (*flags & LDLM_FL_REPLAY) && res->lr_type == LDLM_EXTENT) + OBD_SLAB_ALLOC(node, ldlm_interval_slab, CFS_ALLOC_IO, + sizeof(*node)); + lock_res_and_lock(lock); if (local && lock->l_req_mode == lock->l_granted_mode) { - /* The server returned a blocked lock, but it was granted before - * we got a chance to actually enqueue it. We don't need to do - * anything else. */ + /* The server returned a blocked lock, but it was granted + * before we got a chance to actually enqueue it. We don't + * need to do anything else. */ *flags &= ~(LDLM_FL_BLOCK_GRANTED | LDLM_FL_BLOCK_CONV | LDLM_FL_BLOCK_WAIT); GOTO(out, ELDLM_OK); } + ldlm_resource_unlink_lock(lock); + if (res->lr_type == LDLM_EXTENT && lock->l_tree_node == NULL) { + if (node == NULL) { + ldlm_lock_destroy_nolock(lock); + GOTO(out, rc = -ENOMEM); + } + + ldlm_interval_attach(node, lock); + node = NULL; + } + /* Some flags from the enqueue want to make it into the AST, via the * lock's l_flags. */ lock->l_flags |= *flags & LDLM_AST_DISCARD_DATA; @@ -1208,7 +1243,6 @@ ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns, * * FIXME (bug 268): Detect obvious lies by checking compatibility in * granted/converting queues. */ - ldlm_resource_unlink_lock(lock); if (local) { if (*flags & LDLM_FL_BLOCK_CONV) ldlm_resource_add_lock(res, &res->lr_converting, lock); @@ -1236,6 +1270,8 @@ ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns, GOTO(out, rc); out: unlock_res_and_lock(lock); + if (node) + OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node)); return rc; } @@ -1656,6 +1692,7 @@ struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode, struct ldlm_lock *mark_lock = NULL; int join = LDLM_JOIN_NONE; ldlm_error_t err; + struct ldlm_interval *node; ENTRY; if (new_mode == lock->l_granted_mode) { // No changes? Just return. @@ -1663,6 +1700,12 @@ struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode, RETURN(lock->l_resource); } + /* I can't check the type of lock here because the bitlock of lock + * is not held here, so do the allocation blindly. 
-jay */ + OBD_SLAB_ALLOC(node, ldlm_interval_slab, CFS_ALLOC_IO, sizeof(*node)); + if (node == NULL) /* Actually, this causes EDEADLOCK to be returned */ + RETURN(NULL); + LASSERTF(new_mode == LCK_PW && lock->l_granted_mode == LCK_PR, "new_mode %u, granted %u\n", new_mode, lock->l_granted_mode); @@ -1698,8 +1741,17 @@ struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode, else if (lock->l_res_link.next != &res->lr_granted) mark_lock = list_entry(lock->l_res_link.next, struct ldlm_lock, l_res_link); + } else { + ldlm_resource_unlink_lock(lock); + if (res->lr_type == LDLM_EXTENT) { + /* FIXME: ugly code, I have to attach the lock to a + * interval node again since perhaps it will be granted + * soon */ + CFS_INIT_LIST_HEAD(&node->li_group); + ldlm_interval_attach(node, lock); + node = NULL; + } } - ldlm_resource_unlink_lock(lock); /* If this is a local resource, put it on the appropriate list. */ if (ns_is_client(res->lr_namespace)) { @@ -1725,7 +1777,11 @@ struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode, rc = policy(lock, &pflags, 0, &err, &rpc_list); if (rc == LDLM_ITER_STOP) { lock->l_req_mode = old_mode; - ldlm_granted_list_add_lock(lock, mark_lock, join); + if (res->lr_type == LDLM_EXTENT) + ldlm_extent_add_lock(res, lock); + else + ldlm_granted_list_add_lock(lock, mark_lock, + join); res = NULL; } else { *flags |= LDLM_FL_BLOCK_GRANTED; @@ -1736,6 +1792,8 @@ struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode, if (granted) ldlm_run_ast_work(&rpc_list, LDLM_WORK_CP_AST); + if (node) + OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node)); RETURN(res); } diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index cc802d1..1b0437a 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -1340,7 +1340,26 @@ static void ldlm_handle_cp_callback(struct ptlrpc_request *req, LDLM_DEBUG(lock, "client completion callback handler START"); + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE)) { + int to = cfs_time_seconds(1); + while (to > 0) { + to = schedule_timeout(to); + if (lock->l_granted_mode == lock->l_req_mode || + lock->l_destroyed) + break; + } + } + lock_res_and_lock(lock); + if (lock->l_destroyed || + lock->l_granted_mode == lock->l_req_mode) { + /* bug 11300: the lock has already been granted */ + unlock_res_and_lock(lock); + LDLM_DEBUG(lock, "Double grant race happened"); + LDLM_LOCK_PUT(lock); + EXIT; + return; + } /* If we receive the completion AST before the actual enqueue returned, * then we might need to switch lock modes, resources, or extents. */ @@ -1633,6 +1652,15 @@ static int ldlm_callback_handler(struct ptlrpc_request *req) RETURN(0); } + /* Force a known safe race, send a cancel to the server for a lock + * which the server has already started a blocking callback on. 
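+ * Combined with the check added in ldlm_handle_cp_callback() above, this
+ * exercises the bug 11300 case: the completion AST then finds the lock
+ * already granted (or destroyed) and must return without granting it a
+ * second time.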
*/ + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE) && + lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK) { + rc = ldlm_cli_cancel(&dlm_req->lock_handle[0]); + if (rc < 0) + CERROR("ldlm_cli_cancel: %d\n", rc); + } + lock = ldlm_handle2lock_ns(ns, &dlm_req->lock_handle[0]); if (!lock) { CDEBUG(D_DLMTRACE, "callback on lock "LPX64" - lock " @@ -2172,6 +2200,15 @@ int __init ldlm_init(void) return -ENOMEM; } + ldlm_interval_slab = cfs_mem_cache_create("interval_node", + sizeof(struct ldlm_interval), + 0, SLAB_HWCACHE_ALIGN); + if (ldlm_interval_slab == NULL) { + cfs_mem_cache_destroy(ldlm_resource_slab); + cfs_mem_cache_destroy(ldlm_lock_slab); + return -ENOMEM; + } + return 0; } @@ -2184,6 +2221,8 @@ void __exit ldlm_exit(void) LASSERTF(rc == 0, "couldn't free ldlm resource slab\n"); rc = cfs_mem_cache_destroy(ldlm_lock_slab); LASSERTF(rc == 0, "couldn't free ldlm lock slab\n"); + rc = cfs_mem_cache_destroy(ldlm_interval_slab); + LASSERTF(rc == 0, "couldn't free interval node slab\n"); } /* ldlm_extent.c */ diff --git a/lustre/ldlm/ldlm_resource.c b/lustre/ldlm/ldlm_resource.c index 2f74265..b87ab5f 100644 --- a/lustre/ldlm/ldlm_resource.c +++ b/lustre/ldlm/ldlm_resource.c @@ -622,6 +622,7 @@ static __u32 ldlm_hash_fn(struct ldlm_resource *parent, static struct ldlm_resource *ldlm_resource_new(void) { struct ldlm_resource *res; + int idx; OBD_SLAB_ALLOC(res, ldlm_resource_slab, CFS_ALLOC_IO, sizeof *res); if (res == NULL) @@ -634,6 +635,14 @@ static struct ldlm_resource *ldlm_resource_new(void) CFS_INIT_LIST_HEAD(&res->lr_granted); CFS_INIT_LIST_HEAD(&res->lr_converting); CFS_INIT_LIST_HEAD(&res->lr_waiting); + + /* initialize interval trees for each lock mode*/ + for (idx = 0; idx < LCK_MODE_NUM; idx++) { + res->lr_itree[idx].lit_size = 0; + res->lr_itree[idx].lit_mode = 1 << idx; + res->lr_itree[idx].lit_root = NULL; + } + atomic_set(&res->lr_refcount, 1); spin_lock_init(&res->lr_lock); @@ -905,8 +914,13 @@ void ldlm_resource_insert_lock_after(struct ldlm_lock *original, void ldlm_resource_unlink_lock(struct ldlm_lock *lock) { + int type = lock->l_resource->lr_type; + check_res_locked(lock->l_resource); - ldlm_unlink_lock_skiplist(lock); + if (type == LDLM_IBITS || type == LDLM_PLAIN) + ldlm_unlink_lock_skiplist(lock); + else if (type == LDLM_EXTENT) + ldlm_extent_unlink_lock(lock); list_del_init(&lock->l_res_link); } diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c index 399698f..275d523 100644 --- a/lustre/obdfilter/filter.c +++ b/lustre/obdfilter/filter.c @@ -1566,6 +1566,53 @@ static int filter_destroy_internal(struct obd_device *obd, obd_id objid, return(rc); } +struct filter_intent_args { + struct ldlm_lock **victim; + __u64 size; + int *liblustre; +}; + +static enum interval_iter filter_intent_cb(struct interval_node *n, + void *args) +{ + struct ldlm_interval *node = (struct ldlm_interval *)n; + struct filter_intent_args *arg = (struct filter_intent_args*)args; + __u64 size = arg->size; + struct ldlm_lock **v = arg->victim; + struct ldlm_lock *lck; + + /* If the interval is lower than the current file size, + * just break. */ + if (interval_high(n) <= size) + return INTERVAL_ITER_STOP; + + list_for_each_entry(lck, &node->li_group, l_sl_policy) { + /* Don't send glimpse ASTs to liblustre clients. + * They aren't listening for them, and they do + * entirely synchronous I/O anyways. 
*/ + if (lck->l_export == NULL || + lck->l_export->exp_libclient == 1) + continue; + + if (*arg->liblustre) + *arg->liblustre = 0; + + if (*v == NULL) { + *v = LDLM_LOCK_GET(lck); + } else if ((*v)->l_policy_data.l_extent.start < + lck->l_policy_data.l_extent.start) { + LDLM_LOCK_PUT(*v); + *v = LDLM_LOCK_GET(lck); + } + + /* the same policy group - every lock has the + * same extent, so needn't do it any more */ + break; + } + + return INTERVAL_ITER_CONT; +} + static int filter_intent_policy(struct ldlm_namespace *ns, struct ldlm_lock **lockp, void *req_cookie, ldlm_mode_t mode, int flags, void *data) @@ -1577,9 +1624,10 @@ static int filter_intent_policy(struct ldlm_namespace *ns, ldlm_processing_policy policy; struct ost_lvb *res_lvb, *reply_lvb; struct ldlm_reply *rep; - struct list_head *tmp; ldlm_error_t err; - int rc, tmpflags = 0, only_liblustre = 0; + int idx, rc, tmpflags = 0, only_liblustre = 1; + struct ldlm_interval_tree *tree; + struct filter_intent_args arg; int repsize[3] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body), [DLM_LOCKREPLY_OFF] = sizeof(*rep), [DLM_REPLY_REC_OFF] = sizeof(*reply_lvb) }; @@ -1604,7 +1652,9 @@ static int filter_intent_policy(struct ldlm_namespace *ns, /* If we grant any lock at all, it will be a whole-file read lock. * Call the extent policy function to see if our request can be - * granted, or is blocked. */ + * granted, or is blocked. + * If the OST lock has LDLM_FL_HAS_INTENT set, it means a glimpse lock + */ lock->l_policy_data.l_extent.start = 0; lock->l_policy_data.l_extent.end = OBD_OBJECT_EOF; lock->l_req_mode = LCK_PR; @@ -1652,42 +1702,23 @@ static int filter_intent_policy(struct ldlm_namespace *ns, LASSERT(res_lvb != NULL); *reply_lvb = *res_lvb; - list_for_each(tmp, &res->lr_granted) { - struct ldlm_lock *tmplock = - list_entry(tmp, struct ldlm_lock, l_res_link); - - if (tmplock->l_granted_mode == LCK_PR) - continue; - /* - * ->ns_lock guarantees that no new locks are granted, and, - * therefore, that res->lr_lvb_data cannot increase beyond the - * end of already granted lock. As a result, it is safe to - * check against "stale" reply_lvb->lvb_size value without - * res->lr_lvb_sem. - */ - if (tmplock->l_policy_data.l_extent.end <= reply_lvb->lvb_size) - continue; - - /* Don't send glimpse ASTs to liblustre clients. They aren't - * listening for them, and they do entirely synchronous I/O - * anyways. */ - if (tmplock->l_export == NULL || - tmplock->l_export->exp_libclient == 1) { - only_liblustre = 1; - continue; - } - - if (l == NULL) { - l = LDLM_LOCK_GET(tmplock); - continue; - } - - if (l->l_policy_data.l_extent.start > - tmplock->l_policy_data.l_extent.start) + /* + * ->ns_lock guarantees that no new locks are granted, and, + * therefore, that res->lr_lvb_data cannot increase beyond the + * end of already granted lock. As a result, it is safe to + * check against "stale" reply_lvb->lvb_size value without + * res->lr_lvb_sem. 
+ */ + arg.size = reply_lvb->lvb_size; + arg.victim = &l; + arg.liblustre = &only_liblustre; + for (idx = 0; idx < LCK_MODE_NUM; idx++) { + tree = &res->lr_itree[idx]; + if (tree->lit_mode == LCK_PR) continue; - LDLM_LOCK_PUT(l); - l = LDLM_LOCK_GET(tmplock); + interval_iterate_reverse(tree->lit_root, + filter_intent_cb, &arg); } unlock_res(res); diff --git a/lustre/ptlrpc/Makefile.in b/lustre/ptlrpc/Makefile.in index caab57d..9ade7d5 100644 --- a/lustre/ptlrpc/Makefile.in +++ b/lustre/ptlrpc/Makefile.in @@ -10,6 +10,7 @@ ldlm_objs += $(LDLM)ldlm_plain.o $(LDLM)ldlm_extent.o ldlm_objs += $(LDLM)ldlm_request.o $(LDLM)ldlm_lockd.o ldlm_objs += $(LDLM)ldlm_flock.o $(LDLM)ldlm_inodebits.o ldlm_objs += $(LDLM)ldlm_pool.o +ldlm_objs += $(LDLM)interval_tree.o ptlrpc_objs := client.o recover.o connection.o niobuf.o pack_generic.o ptlrpc_objs += events.o ptlrpc_module.o service.o pinger.o recov_thread.o ptlrpc_objs += llog_net.o llog_client.o llog_server.o import.o ptlrpcd.o @@ -29,6 +30,9 @@ ldlm_%.c: @LUSTRE@/ldlm/ldlm_%.c l_lock.c: @LUSTRE@/ldlm/l_lock.c ln -sf $< $@ +interval_tree.c: @LUSTRE@/ldlm/interval_tree.c + ln -sf $< $@ + EXTRA_PRE_CFLAGS := -I@LUSTRE@/ldlm @INCLUDE_RULES@ diff --git a/lustre/ptlrpc/autoMakefile.am b/lustre/ptlrpc/autoMakefile.am index 7d4754e..d94f0e4 100644 --- a/lustre/ptlrpc/autoMakefile.am +++ b/lustre/ptlrpc/autoMakefile.am @@ -5,6 +5,7 @@ LDLM_COMM_SOURCES= $(top_srcdir)/lustre/ldlm/l_lock.c \ $(top_srcdir)/lustre/ldlm/ldlm_lock.c \ + $(top_srcdir)/lustre/ldlm/interval_tree.c \ $(top_srcdir)/lustre/ldlm/ldlm_resource.c \ $(top_srcdir)/lustre/ldlm/ldlm_lib.c \ $(top_srcdir)/lustre/ldlm/ldlm_plain.c \ @@ -86,4 +87,4 @@ endif install-data-hook: $(install_data_hook) DIST_SOURCES = $(ptlrpc_objs:.o=.c) ptlrpc_internal.h -MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ ldlm_*.c l_lock.c +MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ ldlm_*.c l_lock.c interval_tree.c diff --git a/lustre/ptlrpc/wiretest.c b/lustre/ptlrpc/wiretest.c index d187035..6fe15c5 100644 --- a/lustre/ptlrpc/wiretest.c +++ b/lustre/ptlrpc/wiretest.c @@ -185,6 +185,8 @@ void lustre_assert_wire_constants(void) (long long)LCK_GROUP); LASSERTF(LCK_MAXMODE == 65, " found %lld\n", (long long)LCK_MAXMODE); + LASSERTF(LCK_MODE_NUM == 7, " found %lld\n", + (long long)LCK_MODE_NUM); CLASSERT(LDLM_PLAIN == 10); CLASSERT(LDLM_EXTENT == 11); CLASSERT(LDLM_FLOCK == 12); diff --git a/lustre/tests/Makefile.am b/lustre/tests/Makefile.am index 6854a75..7a565f2 100644 --- a/lustre/tests/Makefile.am +++ b/lustre/tests/Makefile.am @@ -26,7 +26,7 @@ EXTRA_DIST = $(noinst_SCRIPTS) $(noinst_DATA) \ $(nobase_noinst_SCRIPTS) $(nobase_noinst_DATA) if TESTS -noinst_PROGRAMS = openunlink truncate directio openme writeme mlink utime +noinst_PROGRAMS = openunlink truncate directio openme writeme mlink utime it_test noinst_PROGRAMS += tchmod toexcl fsx test_brw openclose createdestroy noinst_PROGRAMS += createmany chownmany statmany multifstat createtest noinst_PROGRAMS += opendirunlink opendevunlink unlinkmany fchdir_test checkstat diff --git a/lustre/tests/it_test.c b/lustre/tests/it_test.c new file mode 100644 index 0000000..dae233b --- /dev/null +++ b/lustre/tests/it_test.c @@ -0,0 +1,519 @@ +/* vi:set ts=8 sw=8 expandtab: */ +/* Unit test tool for interval tree. + * Written by jay + */ +#include +#include +#include +#include + +#include <../ldlm/interval_tree.c> + +#define dprintf(fmt, args...) //printf(fmt, ##args) +#define error(fmt, args...) 
do { \ + fflush(stdout), fflush(stderr); \ + fprintf(stderr, "\nError:" fmt, ##args); \ + abort(); \ +} while(0) + +#define __F(ext) (ext)->start, (ext)->end +#define __S "[%llx:%llx]" + +#define ALIGN_SIZE 4096 +#define ALIGN_MASK (~(ALIGN_SIZE - 1)) + +static struct it_node { + struct interval_node node; + struct list_head list; + int hit, valid; +} *it_array; +static int it_count; +static struct list_head header = LIST_HEAD_INIT(header); +static unsigned long max_count = ULONG_MAX & ALIGN_MASK; +static int have_wide_lock = 0; + +static void it_test_clear(void) +{ + int i = 0; + for (i = 0; i < it_count; i++) + it_array[i].hit = 0; +} + +static enum interval_iter cb(struct interval_node *n, void *args) +{ + struct it_node *node = (struct it_node *)n; + static int count = 1; + + if (node->hit == 1) { + dprintf("NODE "__S" has ever been accessed\n", + __F(&n->in_extent)); + return INTERVAL_ITER_CONT; + error("duplicate node accessing found\n"); + return INTERVAL_ITER_STOP; + } + + if (node->valid == 0) { + error("A deleted node "__S" being accessed\n", + __F(&n->in_extent)); + return INTERVAL_ITER_STOP; + } + + if (count++ == 8) { + dprintf("\n"); + count = 1; + } + dprintf(""__S" ", __F(&n->in_extent)); + fflush(stdout); + + node->hit = 1; + return INTERVAL_ITER_CONT; +} + +static int it_test_search(struct interval_node *root) +{ + struct it_node *n; + struct interval_node_extent ext; + int times = 10, i, err = 0; + + while (times--) { + it_test_clear(); + ext.start = (random() % max_count) & ALIGN_MASK; + ext.end = random() % (max_count - ext.start + 2) + ext.start; + ext.end &= ALIGN_MASK; + if (ext.end > max_count) + ext.end = max_count; + + dprintf("\n\nSearching the node overlapped "__S" ..\n", + __F(&ext)); + + interval_search(root, &ext, cb, NULL); + + dprintf("\nverifing ..."); + + /* verify */ + for (i = 0; i < it_count; i++) { + n = &it_array[i]; + if (n->valid == 0) + continue; + + if (extent_overlapped(&ext, &n->node.in_extent) && + n->hit == 0) + error("node "__S" overlaps" __S"," + "but never to be hit.\n", + __F(&n->node.in_extent), + __F(&ext)); + + if (!extent_overlapped(&ext, &n->node.in_extent) && + n->hit) + error("node "__S" overlaps" __S", but hit.\n", + __F(&n->node.in_extent), + __F(&ext)); + } + if (err) error("search error\n"); + dprintf("ok.\n"); + } + + return 0; +} + +static int it_test_iterate(struct interval_node *root) +{ + int i; + + dprintf("\n\nIterate testing start..\n"); + + it_test_clear(); + interval_iterate(root, cb, NULL); + + /* verify */ + for (i = 0; i < it_count; i++) { + if (it_array[i].valid == 0) + continue; + if (it_array[i].hit == 0) + error("Node "__S" is not accessed\n", + __F(&it_array[i].node.in_extent)); + } + + return 0; +} + +static int it_test_iterate_reverse(struct interval_node *root) +{ + int i; + + dprintf("\n\niterate reverse testing start..\n"); + it_test_clear(); + interval_iterate_reverse(root, cb, NULL); + + /* verify */ + for (i = 0; i < it_count; i++) { + if (it_array[i].valid == 0) + continue; + if (it_array[i].hit == 0) + error("Not every extent is accessed\n"); + } + + return 0; +} + +static int it_test_find(struct interval_node *root) +{ + int idx; + struct interval_node_extent *ext; + + dprintf("\ninterval_find testing start ..\n"); + for (idx = 0; idx < it_count; idx++) { + if (it_array[idx].valid == 0) + continue; + + ext = &it_array[idx].node.in_extent; + dprintf("Try to find "__S"\n", __F(ext)); + if (!interval_find(root, ext)) + error("interval_find, try to find "__S"\n", __F(ext)); + } + return 0; +} + +/* 
sanity test is tightly coupled with implementation, so when you changed + * the interval tree implementation, change this code also. */ +static enum interval_iter sanity_cb(struct interval_node *node, void *args) +{ + __u64 max_high = node->in_max_high; + struct interval_node *tmp, *parent; + int left = 1, has = 0, nr = 0; + + parent = node->in_parent; + node->in_parent = NULL; + interval_for_each(tmp, node) { + if ((left && node_compare(tmp, node) > 0) || + (!left && node_compare(tmp, node) < 0)) + error("interval tree sanity test\n"); + + if (tmp->in_max_high > max_high) { + dprintf("max high sanity check, max_high is %llu," + "child max_high: %llu"__S"\n", + max_high, tmp->in_max_high, + __F(&tmp->in_extent)); + goto err; + } else if (tmp->in_max_high == max_high) { + has = 1; + } + + if (tmp == node) { + left = 0; + continue; + } + } + + if (!has) { + int count = 1; +err: + dprintf("node"__S":%llu Child list:\n", + node->in_extent.start, + node->in_extent.end, + node->in_max_high); + + interval_for_each(tmp, node) { + dprintf(""__S":%llu ", + __F(&tmp->in_extent), + tmp->in_max_high); + if (count++ == 8) { + dprintf("\n"); + count = 1; + } + } + + error("max high sanity check, has == %d\n", has); + } + node->in_parent = parent; + + tmp = node; + while (tmp) { + if (node_is_black(tmp)) + nr++; + else if ((tmp->in_left && node_is_red(tmp->in_left)) || + (tmp->in_right && node_is_red(tmp->in_right))) + error("wrong tree, a red node has red child\n"); + tmp = tmp->in_left; + } + + tmp = node; + while (tmp) { + if (node_is_black(tmp)) + nr--; + tmp = tmp->in_right; + } + if (nr) + error("wrong tree, unbalanced!\n"); + + return 0; +} + +static int it_test_sanity(struct interval_node *root) +{ + it_test_clear(); + interval_iterate(root, sanity_cb, NULL); + return 0; +} + +static int it_test_search_hole(struct interval_node *root) +{ + int i, count = 10; + struct interval_node_extent ext, ext2; + struct it_node *n; + __u64 low = 0, high = ~0; + + do { + if (--count == 0) + return 0; + + ext.start = random() % max_count; + ext.end = ext.start; + } while (interval_is_overlapped(root, &ext)); + ext2 = ext; + + interval_expand(root, &ext, NULL); + dprintf("Extending "__S" to .."__S"\n", __F(&ext2), __F(&ext)); + for (i = 0; i < it_count; i++) { + n = &it_array[i]; + if (n->valid == 0) + continue; + + if (extent_overlapped(&ext, &n->node.in_extent)) { + error("Extending "__S" to .."__S" overlaps node"__S"\n", + __F(&ext2), __F(&ext), __F(&n->node.in_extent)); + } + + if (n->node.in_extent.end < ext2.start) + low = max_u64(n->node.in_extent.end + 1, low); + + if (n->node.in_extent.start > ext2.end) + high = min_u64(n->node.in_extent.start - 1, high); + } + + /* only expanding high right now */ + if (ext2.start != ext.start || high != ext.end) { + ext2.start = low, ext2.end = high; + error("Real extending result:"__S", expected:"__S"\n", + __F(&ext), __F(&ext2)); + } + + return 0; +} + +static int contended_count = 0; +#define LOOP_COUNT 1000 +static enum interval_iter perf_cb(struct interval_node *n, void *args) +{ + unsigned long count = LOOP_COUNT; + while (count--); + contended_count++; + return INTERVAL_ITER_CONT; +} + +static inline long tv_delta(struct timeval *s, struct timeval *e) +{ + long c = e->tv_sec - s->tv_sec; + c *= 1000; + c += (long int)(e->tv_usec - s->tv_usec) / 1000; + dprintf("\tStart: %lu:%lu -> End: %lu:%lu\n", + s->tv_sec, s->tv_usec, e->tv_sec, e->tv_usec); + return c; +} + +static int it_test_performance(struct interval_node *root, unsigned long len) +{ + int i = 0, 
interval_time, list_time; + struct interval_node_extent ext; + struct it_node *n; + struct timeval start, end; + unsigned long count; + + ext.start = (random() % (max_count - len)) & ALIGN_MASK; + ext.end = (ext.start + len) & ALIGN_MASK; + if (have_wide_lock) { + ext.start = (max_count - len) & ALIGN_MASK; + ext.end = max_count; + } + + dprintf("Extent search"__S"\n", __F(&ext)); + + /* list */ + contended_count = 0; + gettimeofday(&start, NULL); + list_for_each_entry(n, &header, list) { + if (extent_overlapped(&ext, &n->node.in_extent)) { + count = LOOP_COUNT; + while (count--); + contended_count++; + } + } + gettimeofday(&end, NULL); + list_time = tv_delta(&start, &end); + i = contended_count; + + /* interval */ + contended_count = 0; + gettimeofday(&start, NULL); + interval_search(root, &ext, perf_cb, &contended_count); + gettimeofday(&end, NULL); + interval_time = tv_delta(&start, &end); + + if (i != contended_count) + error("count of contended lock don't match(%d: %d)\n", + i, contended_count); + + printf("\tList vs Int. search: \n\t\t" + "(%d vs %d)ms, %d contended lock.\n", + list_time, interval_time, contended_count); + + return 0; +} + +static struct interval_node *it_test_helper(struct interval_node *root) +{ + int idx, count = 0; + struct it_node *n; + + count = random() % it_count; + while (count--) { + idx = random() % it_count; + n = &it_array[idx]; + if (n->valid) { + if (!interval_find(root, &n->node.in_extent)) + error("Cannot find an existent node\n"); + dprintf("Erasing a node "__S"\n", + __F(&n->node.in_extent)); + interval_erase(&n->node, &root); + n->valid = 0; + list_del_init(&n->list); + } else { + __u64 low, high; + low = (random() % max_count) & ALIGN_MASK; + high = ((random() % max_count + 1) & ALIGN_MASK) + low; + if (high > max_count) + high = max_count; + interval_set(&n->node, low, high); + while (interval_insert(&n->node, &root)) + interval_set(&n->node, low, ++high); + dprintf("Adding a node "__S"\n", + __F(&n->node.in_extent)); + n->valid = 1; + list_add(&n->list, &header); + } + } + + return root; +} + +static struct interval_node *it_test_init(int count) +{ + int i; + uint64_t high, low, len; + struct it_node *n; + struct interval_node *root = NULL; + + it_count = count; + it_array = (struct it_node *)malloc(sizeof(struct it_node) * count); + if (it_array == NULL) + error("it_array == NULL, no memory\n"); + + have_wide_lock = 0; + for (i = 0; i < count; i++) { + n = &it_array[i]; + do { + low = (random() % max_count + 1) & ALIGN_MASK; + len = (random() % 256 + 1) * ALIGN_SIZE; + if (!have_wide_lock && !(random() % count)) { + low = 0; + len = max_count; + have_wide_lock = 1; + } + high = low + (len & ALIGN_MASK); + + interval_set(&n->node, low, high); + } while (interval_insert(&n->node, &root)); + n->hit = 0; + n->valid = 1; + if (i == 0) + list_add_tail(&n->list, &header); + else + list_add_tail(&n->list, &it_array[rand()%i].list); + } + + return root; +} + +static void it_test_fini(void) +{ + free(it_array); + it_array = NULL; + it_count = 0; + max_count = 0; +} + +int main(int argc, char *argv[]) +{ + int count = 5, perf = 0; + struct interval_node *root; + struct timeval tv; + + gettimeofday(&tv, NULL); + srandom(tv.tv_usec); + + if (argc == 2) { + if (strcmp(argv[1], "-p")) + error("Unknow options, usage: %s [-p]\n", argv[0]); + perf = 1; + count = 1; + } + + if (perf) { + int M = 1024 * 1024; + root = it_test_init(1000000); + printf("1M locks with 4K request size\n"); + it_test_performance(root, 4096); + printf("1M locks with 128K request 
size\n"); + it_test_performance(root, 128 * 1024); + printf("1M locks with 256K request size\n"); + it_test_performance(root, 256 * 1024); + printf("1M locks with 1M request size\n"); + it_test_performance(root, 1 * M); + printf("1M locks with 16M request size\n"); + it_test_performance(root, 16 * M); + printf("1M locks with 32M request size\n"); + it_test_performance(root, 32 * M); + printf("1M locks with 64M request size\n"); + it_test_performance(root, 64 * M); + printf("1M locks with 128M request size\n"); + it_test_performance(root, 128 * M); + printf("1M locks with 256M request size\n"); + it_test_performance(root, 256 * M); + printf("1M locks with 512M request size\n"); + it_test_performance(root, 512 * M); + printf("1M locks with 1G request size\n"); + it_test_performance(root, 1024 * M); + printf("1M locks with 2G request size\n"); + it_test_performance(root, 2048 * M); + printf("1M locks with 3G request size\n"); + it_test_performance(root, 3072 * M); + printf("1M locks with 4G request size\n"); + it_test_performance(root, max_count - 1); + it_test_fini(); + return 0; + } + + root = it_test_init(random() % 100000 + 1000); + while (count--) { + it_test_sanity(root); + it_test_iterate(root); + it_test_iterate_reverse(root); + it_test_find(root); + it_test_search_hole(root); + it_test_search(root); + root = it_test_helper(root); + } + it_test_fini(); + + return 0; +} diff --git a/lustre/tests/sanityN.sh b/lustre/tests/sanityN.sh index 3c9b4b6..0f6b129 100644 --- a/lustre/tests/sanityN.sh +++ b/lustre/tests/sanityN.sh @@ -606,6 +606,18 @@ test_30() { #bug #11110 run_test 30 "recreate file race =========" +test_31() { + mkdir -p $DIR1/$tdir || error "Creating dir $DIR1/$tdir" + writes=`LANG=C dd if=/dev/zero of=$DIR/$tdir/$tfile count=1 2>&1 | + awk 'BEGIN { FS="+" } /out/ {print $1}'` + #define OBD_FAIL_LDLM_CANCEL_BL_CB_RACE 0x314 + sysctl -w lustre.fail_loc=0x314 + reads=`LANG=C dd if=$DIR2/$tdir/$tfile of=/dev/null 2>&1 | + awk 'BEGIN { FS="+" } /in/ {print $1}'` + [ $reads -eq $writes ] || error "read" $reads "blocks, must be" $writes +} +run_test 31 "voluntary cancel / blocking ast race==============" + log "cleanup: ======================================================" check_and_cleanup_lustre diff --git a/lustre/utils/wiretest.c b/lustre/utils/wiretest.c index 628edfa..e963f86 100644 --- a/lustre/utils/wiretest.c +++ b/lustre/utils/wiretest.c @@ -203,6 +203,8 @@ void lustre_assert_wire_constants(void) (long long)LCK_GROUP); LASSERTF(LCK_MAXMODE == 65, " found %lld\n", (long long)LCK_MAXMODE); + LASSERTF(LCK_MODE_NUM == 7, " found %lld\n", + (long long)LCK_MODE_NUM); CLASSERT(LDLM_PLAIN == 10); CLASSERT(LDLM_EXTENT == 11); CLASSERT(LDLM_FLOCK == 12); -- 1.8.3.1