From: Liang Zhen
Date: Fri, 8 Jul 2011 17:43:08 +0000 (-0400)
Subject: LU-50 ldiskfs: pdirops patch for ldiskfs
X-Git-Tag: 2.1.53~8
X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=8e858671be59ed53fad2d340cf841b026943cc8a

LU-50 ldiskfs: pdirops patch for ldiskfs

Single directory performance is critical for HPC workloads. In a typical
use case an application creates a separate output file for each node and
task in a job. As nodes and tasks increase, hundreds of thousands of
files may be created in a single directory within a short window of time.
Today, both filename lookup and file system modifying operations (such as
create and unlink) are protected with a single lock for an entire ldiskfs
directory. The PDO project removes this bottleneck by introducing a
parallel locking mechanism for ldiskfs directories. This work enables
multiple application threads to perform lookup, create and unlink
operations in parallel.

This patch contains:
- pdirops support for ldiskfs
- N-level htree directory
- integration with osd-ldiskfs

Change-Id: I269c0e3112e68f3acd79e860dab052a68c7d7aaa
Signed-off-by: Liang Zhen
Reviewed-on: http://review.whamcloud.com/375
Tested-by: Hudson
Reviewed-by: Andreas Dilger
Tested-by: Maloo
Reviewed-by: Oleg Drokin
---

diff --git a/build/autoconf/lustre-build-ldiskfs.m4 b/build/autoconf/lustre-build-ldiskfs.m4
index ccb85c9..6092418 100644
--- a/build/autoconf/lustre-build-ldiskfs.m4
+++ b/build/autoconf/lustre-build-ldiskfs.m4
@@ -236,8 +236,16 @@ AC_DEFUN([LB_LDISKFS_DEFINE_OPTIONS], [
 AC_DEFINE(HAVE_LDISKFS_OSD, 1, Enable ldiskfs osd)
+with_ldiskfs_pdo=no
 if test $LDISKFS_BACKFS = 'ext4'; then
 	AC_DEFINE(HAVE_EXT4_LDISKFS, 1, [build ext4 based ldiskfs])
+	case $LINUXRELEASE in
+	2.6.32*)
+		if test x$RHEL_KERNEL = xyes; then
+			with_ldiskfs_pdo=yes
+			AC_DEFINE(HAVE_LDISKFS_PDO, 1, [have ldiskfs PDO patch])
+		fi
+	esac
 fi
 LB_LDISKFS_JBD2_JOURNAL_CALLBACK_SET

diff --git a/ldiskfs/configure.ac b/ldiskfs/configure.ac
index 50918f7..448841b 100644
--- a/ldiskfs/configure.ac
+++ b/ldiskfs/configure.ac
@@ -121,6 +121,8 @@ BACKFSU=${BACKFS/ext/EXT}
 AC_SUBST(BACKFSU) # We need a Upper string
 fi
+AM_CONDITIONAL(LDISKFS_PDO, test x$with_ldiskfs_pdo = xyes)
+
 AC_SUBST(ac_configure_args)
 LB_CONFIG_FILES

diff --git a/ldiskfs/kernel_patches/patches/ext4_pdirop-rhel6.patch b/ldiskfs/kernel_patches/patches/ext4_pdirop-rhel6.patch
new file mode 100644
index 0000000..103686a
--- /dev/null
+++ b/ldiskfs/kernel_patches/patches/ext4_pdirop-rhel6.patch
@@ -0,0 +1,2258 @@
+--- /dev/null	2011-12-14 22:16:16.000000000 +0800
++++ linux-2.6.32-131.6.1-pdo/include/linux/htree_lock.h	2011-12-02 17:09:34.000000000 +0800
+@@ -0,0 +1,187 @@
++/*
++ * include/linux/htree_lock.h
++ *
++ * Copyright (c) 2011 Whamcloud, Inc.
++ *
++ * Author: Liang Zhen
++ */
++
++/*
++ * htree lock
++ *
++ * htree_lock is an advanced lock; it supports five lock modes (concept
++ * taken from DLM) and it is a sleeping lock.
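++ *
++ * mode compatibility (summarized from the HTREE_LOCK_COMPAT_* table in
++ * htree_lock.c below): EX conflicts with all other modes; PW admits only
++ * CR; PR admits PR and CR; CW admits CW and CR; CR admits everything
++ * except EX.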
++ *
++ * most common use case is:
++ * - create a htree_lock_head for data
++ * - each thread (contender) creates its own htree_lock
++ * - contender needs to call htree_lock(lock_node, mode) to protect data and
++ *   call htree_unlock to release lock
++ *
++ * Also, there is an advanced use-case which is more complex: the user can
++ * have a PW/PR lock on a particular key, which is mostly used while the
++ * user is holding a shared lock on the htree (CW, CR)
++ *
++ * htree_lock(lock_node, HTREE_LOCK_CR); lock the htree with CR
++ * htree_node_lock(lock_node, HTREE_LOCK_PR, key...); lock @key with PR
++ * ...
++ * htree_node_unlock(lock_node); unlock the key
++ *
++ * Another tip is, we can have N levels of this kind of keys; all we need
++ * to do is to specify N levels while creating the htree_lock_head, then
++ * we can lock/unlock a specific level by:
++ * htree_node_lock(lock_node, mode1, key1, level1...);
++ * do something;
++ * htree_node_lock(lock_node, mode1, key2, level2...);
++ * do something;
++ * htree_node_unlock(lock_node, level2);
++ * htree_node_unlock(lock_node, level1);
++ *
++ * NB: for multi-level locking, be careful about the locking order to
++ * avoid deadlock
++ */
++
++#ifndef _LINUX_HTREE_LOCK_H
++#define _LINUX_HTREE_LOCK_H
++
++#include <linux/list.h>
++#include <linux/spinlock.h>
++#include <linux/sched.h>
++
++/*
++ * Lock Modes
++ * more details can be found here:
++ * http://en.wikipedia.org/wiki/Distributed_lock_manager
++ */
++typedef enum {
++	HTREE_LOCK_EX	= 0,	/* exclusive lock: incompatible with all others */
++	HTREE_LOCK_PW,		/* protected write: allows only CR users */
++	HTREE_LOCK_PR,		/* protected read: allow PR, CR users */
++	HTREE_LOCK_CW,		/* concurrent write: allow CR, CW users */
++	HTREE_LOCK_CR,		/* concurrent read: allow all but EX users */
++	HTREE_LOCK_MAX,		/* number of lock modes */
++} htree_lock_mode_t;
++
++#define HTREE_LOCK_NL		HTREE_LOCK_MAX
++#define HTREE_LOCK_INVAL	0xdead10c
++
++enum {
++	HTREE_HBITS_MIN		= 2,
++	HTREE_HBITS_DEF		= 14,
++	HTREE_HBITS_MAX		= 32,
++};
++
++enum {
++	HTREE_EVENT_DISABLE	= (0),
++	HTREE_EVENT_RD		= (1 << HTREE_LOCK_PR),
++	HTREE_EVENT_WR		= (1 << HTREE_LOCK_PW),
++	HTREE_EVENT_RDWR	= (HTREE_EVENT_RD | HTREE_EVENT_WR),
++};
++
++struct htree_lock;
++
++typedef void (*htree_event_cb_t)(void *target, void *event);
++
++struct htree_lock_child {
++	struct list_head	lc_list;	/* granted list */
++	htree_event_cb_t	lc_callback;	/* event callback */
++	unsigned		lc_events;	/* event types */
++};
++
++struct htree_lock_head {
++	unsigned long		lh_lock;	/* bits lock */
++	/* blocked lock list (htree_lock) */
++	struct list_head	lh_blocked_list;
++	/* # key levels */
++	u16			lh_depth;
++	/* hash bits for key and limit number of locks */
++	u16			lh_hbits;
++	/* counters for blocked locks */
++	u16			lh_nblocked[HTREE_LOCK_MAX];
++	/* counters for granted locks */
++	u16			lh_ngranted[HTREE_LOCK_MAX];
++	/* private data */
++	void			*lh_private;
++	/* array of children locks */
++	struct htree_lock_child	lh_children[0];
++};
++
++/* htree_lock_node_t is child-lock for a specific key (ln_value) */
++struct htree_lock_node {
++	htree_lock_mode_t	ln_mode;
++	/* major hash key */
++	u16			ln_major_key;
++	/* minor hash key */
++	u16			ln_minor_key;
++	struct list_head	ln_major_list;
++	struct list_head	ln_minor_list;
++	/* alive list, all locks (granted, blocked, listening) are on it */
++	struct list_head	ln_alive_list;
++	/* blocked list */
++	struct list_head	ln_blocked_list;
++	/* granted list */
++	struct list_head	ln_granted_list;
++	void			*ln_ev_target;
++};
++
++struct htree_lock {
++	struct task_struct
*lk_task; ++ struct htree_lock_head *lk_head; ++ void *lk_private; ++ unsigned lk_depth; ++ htree_lock_mode_t lk_mode; ++ struct list_head lk_blocked_list; ++ struct htree_lock_node lk_nodes[0]; ++}; ++ ++/* create a lock head, which stands for a resource */ ++struct htree_lock_head *htree_lock_head_alloc(unsigned depth, ++ unsigned hbits, unsigned priv); ++/* free a lock head */ ++void htree_lock_head_free(struct htree_lock_head *lhead); ++/* register event callback for child lock at level @depth */ ++void htree_lock_event_attach(struct htree_lock_head *lhead, unsigned depth, ++ unsigned events, htree_event_cb_t callback); ++/* create a lock handle, which stands for a thread */ ++struct htree_lock *htree_lock_alloc(unsigned depth, unsigned pbytes); ++/* free a lock handle */ ++void htree_lock_free(struct htree_lock *lck); ++/* lock htree, when @wait is true, 0 is returned if the lock can't ++ * be granted immediately */ ++int htree_lock_try(struct htree_lock *lck, struct htree_lock_head *lhead, ++ htree_lock_mode_t mode, int wait); ++/* unlock htree */ ++void htree_unlock(struct htree_lock *lck); ++/* unlock and relock htree with @new_mode */ ++int htree_change_lock_try(struct htree_lock *lck, ++ htree_lock_mode_t new_mode, int wait); ++void htree_change_mode(struct htree_lock *lck, htree_lock_mode_t mode); ++/* require child lock (key) of htree at level @dep, @event will be sent to all ++ * listeners on this @key while lock being granted */ ++int htree_node_lock_try(struct htree_lock *lck, htree_lock_mode_t mode, ++ u32 key, unsigned dep, int wait, void *event); ++/* release child lock at level @dep, this lock will listen on it's key ++ * if @event isn't NULL, event_cb will be called against @lck while granting ++ * any other lock at level @dep with the same key */ ++void htree_node_unlock(struct htree_lock *lck, unsigned dep, void *event); ++/* stop listening on child lock at level @dep */ ++void htree_node_stop_listen(struct htree_lock *lck, unsigned dep); ++/* for debug */ ++void htree_lock_stat_print(int depth); ++void htree_lock_stat_reset(void); ++ ++#define htree_lock(lck, lh, mode) htree_lock_try(lck, lh, mode, 1) ++#define htree_change_lock(lck, mode) htree_change_lock_try(lck, mode, 1) ++ ++#define htree_lock_mode(lck) ((lck)->lk_mode) ++ ++#define htree_node_lock(lck, mode, key, dep) \ ++ htree_node_lock_try(lck, mode, key, dep, 1, NULL) ++/* this is only safe in thread context of lock owner */ ++#define htree_node_is_granted(lck, dep) \ ++ ((lck)->lk_nodes[dep].ln_mode != HTREE_LOCK_INVAL && \ ++ (lck)->lk_nodes[dep].ln_mode != HTREE_LOCK_NL) ++/* this is only safe in thread context of lock owner */ ++#define htree_node_is_listening(lck, dep) \ ++ ((lck)->lk_nodes[dep].ln_mode == HTREE_LOCK_NL) ++ ++#endif +--- /dev/null 2011-12-14 22:16:16.000000000 +0800 ++++ linux-2.6.32-131.6.1-pdo/fs/ext4/htree_lock.c 2011-12-14 22:56:28.000000000 +0800 +@@ -0,0 +1,880 @@ ++/* ++ * fs/ext4/htree_lock.c ++ * ++ * Copyright (c) 2011 Whamcloud, Inc. 
++ *
++ * Author: Liang Zhen
++ */
++#include <linux/jbd2.h>
++#include <linux/hash.h>
++#include <linux/module.h>
++#include <linux/htree_lock.h>
++
++enum {
++	HTREE_LOCK_BIT_EX	= (1 << HTREE_LOCK_EX),
++	HTREE_LOCK_BIT_PW	= (1 << HTREE_LOCK_PW),
++	HTREE_LOCK_BIT_PR	= (1 << HTREE_LOCK_PR),
++	HTREE_LOCK_BIT_CW	= (1 << HTREE_LOCK_CW),
++	HTREE_LOCK_BIT_CR	= (1 << HTREE_LOCK_CR),
++};
++
++enum {
++	HTREE_LOCK_COMPAT_EX	= 0,
++	HTREE_LOCK_COMPAT_PW	= HTREE_LOCK_COMPAT_EX | HTREE_LOCK_BIT_CR,
++	HTREE_LOCK_COMPAT_PR	= HTREE_LOCK_COMPAT_PW | HTREE_LOCK_BIT_PR,
++	HTREE_LOCK_COMPAT_CW	= HTREE_LOCK_COMPAT_PW | HTREE_LOCK_BIT_CW,
++	HTREE_LOCK_COMPAT_CR	= HTREE_LOCK_COMPAT_CW | HTREE_LOCK_BIT_PR |
++				  HTREE_LOCK_BIT_PW,
++};
++
++static int htree_lock_compat[] = {
++	[HTREE_LOCK_EX]		HTREE_LOCK_COMPAT_EX,
++	[HTREE_LOCK_PW]		HTREE_LOCK_COMPAT_PW,
++	[HTREE_LOCK_PR]		HTREE_LOCK_COMPAT_PR,
++	[HTREE_LOCK_CW]		HTREE_LOCK_COMPAT_CW,
++	[HTREE_LOCK_CR]		HTREE_LOCK_COMPAT_CR,
++};
++
++/* max allowed htree-lock depth.
++ * We only need depth=3 for ext4, although users can have a higher value. */
++#define HTREE_LOCK_DEP_MAX	16
++
++#ifdef HTREE_LOCK_DEBUG
++
++static char *hl_name[] = {
++	[HTREE_LOCK_EX]		"EX",
++	[HTREE_LOCK_PW]		"PW",
++	[HTREE_LOCK_PR]		"PR",
++	[HTREE_LOCK_CW]		"CW",
++	[HTREE_LOCK_CR]		"CR",
++};
++
++/* lock stats */
++struct htree_lock_node_stats {
++	unsigned long long	blocked[HTREE_LOCK_MAX];
++	unsigned long long	granted[HTREE_LOCK_MAX];
++	unsigned long long	retried[HTREE_LOCK_MAX];
++	unsigned long long	events;
++};
++
++struct htree_lock_stats {
++	struct htree_lock_node_stats	nodes[HTREE_LOCK_DEP_MAX];
++	unsigned long long	granted[HTREE_LOCK_MAX];
++	unsigned long long	blocked[HTREE_LOCK_MAX];
++};
++
++static struct htree_lock_stats hl_stats;
++
++void htree_lock_stat_reset(void)
++{
++	memset(&hl_stats, 0, sizeof(hl_stats));
++}
++
++void htree_lock_stat_print(int depth)
++{
++	int i;
++	int j;
++
++	printk(KERN_DEBUG "HTREE LOCK STATS:\n");
++	for (i = 0; i < HTREE_LOCK_MAX; i++) {
++		printk(KERN_DEBUG "[%s]: G [%10llu], B [%10llu]\n",
++		       hl_name[i], hl_stats.granted[i], hl_stats.blocked[i]);
++	}
++	for (i = 0; i < depth; i++) {
++		printk(KERN_DEBUG "HTREE CHILD [%d] STATS:\n", i);
++		for (j = 0; j < HTREE_LOCK_MAX; j++) {
++			printk(KERN_DEBUG
++			       "[%s]: G [%10llu], B [%10llu], R [%10llu]\n",
++			       hl_name[j], hl_stats.nodes[i].granted[j],
++			       hl_stats.nodes[i].blocked[j],
++			       hl_stats.nodes[i].retried[j]);
++		}
++	}
++}
++
++#define lk_grant_inc(m)		do { hl_stats.granted[m]++; } while (0)
++#define lk_block_inc(m)		do { hl_stats.blocked[m]++; } while (0)
++#define ln_grant_inc(d, m)	do { hl_stats.nodes[d].granted[m]++; } while (0)
++#define ln_block_inc(d, m)	do { hl_stats.nodes[d].blocked[m]++; } while (0)
++#define ln_retry_inc(d, m)	do { hl_stats.nodes[d].retried[m]++; } while (0)
++#define ln_event_inc(d)		do { hl_stats.nodes[d].events++; } while (0)
++
++#else /* !DEBUG */
++
++void htree_lock_stat_reset(void) {}
++void htree_lock_stat_print(int depth) {}
++
++#define lk_grant_inc(m)		do {} while (0)
++#define lk_block_inc(m)		do {} while (0)
++#define ln_grant_inc(d, m)	do {} while (0)
++#define ln_block_inc(d, m)	do {} while (0)
++#define ln_retry_inc(d, m)	do {} while (0)
++#define ln_event_inc(d)		do {} while (0)
++
++#endif /* DEBUG */
++
++EXPORT_SYMBOL(htree_lock_stat_reset);
++EXPORT_SYMBOL(htree_lock_stat_print);
++
++#define HTREE_DEP_ROOT		(-1)
++
++#define htree_spin_lock(lhead, dep)	\
++	bit_spin_lock((dep) + 1, &(lhead)->lh_lock)
++#define htree_spin_unlock(lhead, dep)	\
++	bit_spin_unlock((dep) + 1, &(lhead)->lh_lock)
++
++#define
htree_key_event_ignore(child, ln) \ ++ (!((child)->lc_events & (1 << (ln)->ln_mode))) ++ ++static int ++htree_key_list_empty(struct htree_lock_node *ln) ++{ ++ return list_empty(&ln->ln_major_list) && list_empty(&ln->ln_minor_list); ++} ++ ++static void ++htree_key_list_del_init(struct htree_lock_node *ln) ++{ ++ struct htree_lock_node *tmp = NULL; ++ ++ if (!list_empty(&ln->ln_minor_list)) { ++ tmp = list_entry(ln->ln_minor_list.next, ++ struct htree_lock_node, ln_minor_list); ++ list_del_init(&ln->ln_minor_list); ++ } ++ ++ if (list_empty(&ln->ln_major_list)) ++ return; ++ ++ if (tmp == NULL) { /* not on minor key list */ ++ list_del_init(&ln->ln_major_list); ++ } else { ++ BUG_ON(!list_empty(&tmp->ln_major_list)); ++ list_replace_init(&ln->ln_major_list, &tmp->ln_major_list); ++ } ++} ++ ++static void ++htree_key_list_replace_init(struct htree_lock_node *old, ++ struct htree_lock_node *new) ++{ ++ if (!list_empty(&old->ln_major_list)) ++ list_replace_init(&old->ln_major_list, &new->ln_major_list); ++ ++ if (!list_empty(&old->ln_minor_list)) ++ list_replace_init(&old->ln_minor_list, &new->ln_minor_list); ++} ++ ++static void ++htree_key_event_enqueue(struct htree_lock_child *child, ++ struct htree_lock_node *ln, int dep, void *event) ++{ ++ struct htree_lock_node *tmp; ++ ++ /* NB: ALWAYS called holding lhead::lh_lock(dep) */ ++ BUG_ON(ln->ln_mode == HTREE_LOCK_NL); ++ if (event == NULL || htree_key_event_ignore(child, ln)) ++ return; ++ ++ /* shouldn't be a very long list */ ++ list_for_each_entry(tmp, &ln->ln_alive_list, ln_alive_list) { ++ if (tmp->ln_mode == HTREE_LOCK_NL) { ++ ln_event_inc(dep); ++ if (child->lc_callback != NULL) ++ child->lc_callback(tmp->ln_ev_target, event); ++ } ++ } ++} ++ ++static int ++htree_node_lock_enqueue(struct htree_lock *newlk, struct htree_lock *curlk, ++ unsigned dep, int wait, void *event) ++{ ++ struct htree_lock_child *child = &newlk->lk_head->lh_children[dep]; ++ struct htree_lock_node *newln = &newlk->lk_nodes[dep]; ++ struct htree_lock_node *curln = &curlk->lk_nodes[dep]; ++ ++ /* NB: ALWAYS called holding lhead::lh_lock(dep) */ ++ /* NB: we only expect PR/PW lock mode at here, only these two modes are ++ * allowed for htree_node_lock(asserted in htree_node_lock_internal), ++ * NL is only used for listener, user can't directly require NL mode */ ++ if ((curln->ln_mode == HTREE_LOCK_NL) || ++ (curln->ln_mode != HTREE_LOCK_PW && ++ newln->ln_mode != HTREE_LOCK_PW)) { ++ /* no conflict, attach it on granted list of @curlk */ ++ if (curln->ln_mode != HTREE_LOCK_NL) { ++ list_add(&newln->ln_granted_list, ++ &curln->ln_granted_list); ++ } else { ++ /* replace key owner */ ++ htree_key_list_replace_init(curln, newln); ++ } ++ ++ list_add(&newln->ln_alive_list, &curln->ln_alive_list); ++ htree_key_event_enqueue(child, newln, dep, event); ++ ln_grant_inc(dep, newln->ln_mode); ++ return 1; /* still hold lh_lock */ ++ } ++ ++ if (!wait) { /* can't grant and don't want to wait */ ++ ln_retry_inc(dep, newln->ln_mode); ++ newln->ln_mode = HTREE_LOCK_INVAL; ++ return -1; /* don't wait and just return -1 */ ++ } ++ ++ newlk->lk_task = current; ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ /* conflict, attach it on blocked list of curlk */ ++ list_add_tail(&newln->ln_blocked_list, &curln->ln_blocked_list); ++ list_add(&newln->ln_alive_list, &curln->ln_alive_list); ++ ln_block_inc(dep, newln->ln_mode); ++ ++ htree_spin_unlock(newlk->lk_head, dep); ++ /* wait to be given the lock */ ++ if (newlk->lk_task != NULL) ++ schedule(); ++ /* granted, no doubt, wake up 
will set me RUNNING */ ++ if (event == NULL || htree_key_event_ignore(child, newln)) ++ return 0; /* granted without lh_lock */ ++ ++ htree_spin_lock(newlk->lk_head, dep); ++ htree_key_event_enqueue(child, newln, dep, event); ++ return 1; /* still hold lh_lock */ ++} ++ ++/* ++ * get PR/PW access to particular tree-node according to @dep and @key, ++ * it will return -1 if @wait is false and can't immediately grant this lock. ++ * All listeners(HTREE_LOCK_NL) on @dep and with the same @key will get ++ * @event if it's not NULL. ++ * NB: ALWAYS called holding lhead::lh_lock ++ */ ++static int ++htree_node_lock_internal(struct htree_lock_head *lhead, struct htree_lock *lck, ++ htree_lock_mode_t mode, u32 key, unsigned dep, ++ int wait, void *event) ++{ ++ LIST_HEAD (list); ++ struct htree_lock *tmp; ++ struct htree_lock *tmp2; ++ u16 major; ++ u16 minor; ++ u8 reverse; ++ u8 ma_bits; ++ u8 mi_bits; ++ ++ BUG_ON(mode != HTREE_LOCK_PW && mode != HTREE_LOCK_PR); ++ BUG_ON(htree_node_is_granted(lck, dep)); ++ ++ key = hash_long(key, lhead->lh_hbits); ++ ++ mi_bits = lhead->lh_hbits >> 1; ++ ma_bits = lhead->lh_hbits - mi_bits; ++ ++ lck->lk_nodes[dep].ln_major_key = major = key & ((1U << ma_bits) - 1); ++ lck->lk_nodes[dep].ln_minor_key = minor = key >> ma_bits; ++ lck->lk_nodes[dep].ln_mode = mode; ++ ++ /* ++ * The major key list is an ordered list, so searches are started ++ * at the end of the list that is numerically closer to major_key, ++ * so at most half of the list will be walked (for well-distributed ++ * keys). The list traversal aborts early if the expected key ++ * location is passed. ++ */ ++ reverse = (major >= (1 << (ma_bits - 1))); ++ ++ if (reverse) { ++ list_for_each_entry_reverse(tmp, ++ &lhead->lh_children[dep].lc_list, ++ lk_nodes[dep].ln_major_list) { ++ if (tmp->lk_nodes[dep].ln_major_key == major) { ++ goto search_minor; ++ ++ } else if (tmp->lk_nodes[dep].ln_major_key < major) { ++ /* attach _after_ @tmp */ ++ list_add(&lck->lk_nodes[dep].ln_major_list, ++ &tmp->lk_nodes[dep].ln_major_list); ++ goto out_grant_major; ++ } ++ } ++ ++ list_add(&lck->lk_nodes[dep].ln_major_list, ++ &lhead->lh_children[dep].lc_list); ++ goto out_grant_major; ++ ++ } else { ++ list_for_each_entry(tmp, &lhead->lh_children[dep].lc_list, ++ lk_nodes[dep].ln_major_list) { ++ if (tmp->lk_nodes[dep].ln_major_key == major) { ++ goto search_minor; ++ ++ } else if (tmp->lk_nodes[dep].ln_major_key > major) { ++ /* insert _before_ @tmp */ ++ list_add_tail(&lck->lk_nodes[dep].ln_major_list, ++ &tmp->lk_nodes[dep].ln_major_list); ++ goto out_grant_major; ++ } ++ } ++ ++ list_add_tail(&lck->lk_nodes[dep].ln_major_list, ++ &lhead->lh_children[dep].lc_list); ++ goto out_grant_major; ++ } ++ ++ search_minor: ++ /* ++ * NB: minor_key list doesn't have a "head", @list is just a ++ * temporary stub for helping list searching, make sure it's removed ++ * after searching. ++ * minor_key list is an ordered list too. 
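++ * (searched with the same strategy as the major-key list above: walk
++ * from the end that is numerically closer to the key, and abort early
++ * once the expected position is passed)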
++ */ ++ list_add_tail(&list, &tmp->lk_nodes[dep].ln_minor_list); ++ ++ reverse = (minor >= (1 << (mi_bits - 1))); ++ ++ if (reverse) { ++ list_for_each_entry_reverse(tmp2, &list, ++ lk_nodes[dep].ln_minor_list) { ++ if (tmp2->lk_nodes[dep].ln_minor_key == minor) { ++ goto out_enqueue; ++ ++ } else if (tmp2->lk_nodes[dep].ln_minor_key < minor) { ++ /* attach _after_ @tmp2 */ ++ list_add(&lck->lk_nodes[dep].ln_minor_list, ++ &tmp2->lk_nodes[dep].ln_minor_list); ++ goto out_grant_minor; ++ } ++ } ++ ++ list_add(&lck->lk_nodes[dep].ln_minor_list, &list); ++ ++ } else { ++ list_for_each_entry(tmp2, &list, ++ lk_nodes[dep].ln_minor_list) { ++ if (tmp2->lk_nodes[dep].ln_minor_key == minor) { ++ goto out_enqueue; ++ ++ } else if (tmp2->lk_nodes[dep].ln_minor_key > minor) { ++ /* insert _before_ @tmp2 */ ++ list_add_tail(&lck->lk_nodes[dep].ln_minor_list, ++ &tmp2->lk_nodes[dep].ln_minor_list); ++ goto out_grant_minor; ++ } ++ } ++ ++ list_add_tail(&lck->lk_nodes[dep].ln_minor_list, &list); ++ } ++ ++ out_grant_minor: ++ if (list.next == &lck->lk_nodes[dep].ln_minor_list) { ++ /* new lock @lck is the first one on minor_key list, which ++ * means it has the smallest minor_key and it should ++ * replace @tmp as minor_key owner */ ++ list_replace_init(&tmp->lk_nodes[dep].ln_major_list, ++ &lck->lk_nodes[dep].ln_major_list); ++ } ++ /* remove the temporary head */ ++ list_del(&list); ++ ++ out_grant_major: ++ ln_grant_inc(dep, lck->lk_nodes[dep].ln_mode); ++ return 1; /* granted with holding lh_lock */ ++ ++ out_enqueue: ++ list_del(&list); /* remove temprary head */ ++ return htree_node_lock_enqueue(lck, tmp2, dep, wait, event); ++} ++ ++/* ++ * release the key of @lck at level @dep, and grant any blocked locks. ++ * caller will still listen on @key if @event is not NULL, which means ++ * caller can see a event (by event_cb) while granting any lock with ++ * the same key at level @dep. ++ * NB: ALWAYS called holding lhead::lh_lock ++ * NB: listener will not block anyone because listening mode is HTREE_LOCK_NL ++ */ ++static void ++htree_node_unlock_internal(struct htree_lock_head *lhead, ++ struct htree_lock *curlk, unsigned dep, void *event) ++{ ++ struct htree_lock_node *curln = &curlk->lk_nodes[dep]; ++ struct htree_lock *grtlk = NULL; ++ struct htree_lock_node *grtln; ++ struct htree_lock *poslk; ++ struct htree_lock *tmplk; ++ ++ if (!htree_node_is_granted(curlk, dep)) ++ return; ++ ++ if (!list_empty(&curln->ln_granted_list)) { ++ /* there is another granted lock */ ++ grtlk = list_entry(curln->ln_granted_list.next, ++ struct htree_lock, ++ lk_nodes[dep].ln_granted_list); ++ list_del_init(&curln->ln_granted_list); ++ } ++ ++ if (grtlk == NULL && !list_empty(&curln->ln_blocked_list)) { ++ /* ++ * @curlk is the only granted lock, so we confirmed: ++ * a) curln is key owner (attached on major/minor_list), ++ * so if there is any blocked lock, it should be attached ++ * on curln->ln_blocked_list ++ * b) we always can grant the first blocked lock ++ */ ++ grtlk = list_entry(curln->ln_blocked_list.next, ++ struct htree_lock, ++ lk_nodes[dep].ln_blocked_list); ++ BUG_ON(grtlk->lk_task == NULL); ++ wake_up_process(grtlk->lk_task); ++ } ++ ++ if (event != NULL && ++ lhead->lh_children[dep].lc_events != HTREE_EVENT_DISABLE) { ++ curln->ln_ev_target = event; ++ curln->ln_mode = HTREE_LOCK_NL; /* listen! 
*/ ++ } else { ++ curln->ln_mode = HTREE_LOCK_INVAL; ++ } ++ ++ if (grtlk == NULL) { /* I must be the only one locking this key */ ++ struct htree_lock_node *tmpln; ++ ++ BUG_ON(htree_key_list_empty(curln)); ++ ++ if (curln->ln_mode == HTREE_LOCK_NL) /* listening */ ++ return; ++ ++ /* not listening */ ++ if (list_empty(&curln->ln_alive_list)) { /* no more listener */ ++ htree_key_list_del_init(curln); ++ return; ++ } ++ ++ tmpln = list_entry(curln->ln_alive_list.next, ++ struct htree_lock_node, ln_alive_list); ++ ++ BUG_ON(tmpln->ln_mode != HTREE_LOCK_NL); ++ ++ htree_key_list_replace_init(curln, tmpln); ++ list_del_init(&curln->ln_alive_list); ++ ++ return; ++ } ++ ++ /* have a granted lock */ ++ grtln = &grtlk->lk_nodes[dep]; ++ if (!list_empty(&curln->ln_blocked_list)) { ++ /* only key owner can be on both lists */ ++ BUG_ON(htree_key_list_empty(curln)); ++ ++ if (list_empty(&grtln->ln_blocked_list)) { ++ list_add(&grtln->ln_blocked_list, ++ &curln->ln_blocked_list); ++ } ++ list_del_init(&curln->ln_blocked_list); ++ } ++ /* ++ * NB: this is the tricky part: ++ * We have only two modes for child-lock (PR and PW), also, ++ * only owner of the key (attached on major/minor_list) can be on ++ * both blocked_list and granted_list, so @grtlk must be one ++ * of these two cases: ++ * ++ * a) @grtlk is taken from granted_list, which means we've granted ++ * more than one lock so @grtlk has to be PR, the first blocked ++ * lock must be PW and we can't grant it at all. ++ * So even @grtlk is not owner of the key (empty blocked_list), ++ * we don't care because we can't grant any lock. ++ * b) we just grant a new lock which is taken from head of blocked ++ * list, and it should be the first granted lock, and it should ++ * be the first one linked on blocked_list. ++ * ++ * Either way, we can get correct result by iterating blocked_list ++ * of @grtlk, and don't have to bother on how to find out ++ * owner of current key. 
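++ * (so in case (a) the loop below stops at the first PW waiter without
++ * granting anything, while in case (b) it keeps granting queued PR
++ * waiters until it meets a PW waiter, or stops immediately if the newly
++ * woken lock is itself PW)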
++ */ ++ list_for_each_entry_safe(poslk, tmplk, &grtln->ln_blocked_list, ++ lk_nodes[dep].ln_blocked_list) { ++ if (grtlk->lk_nodes[dep].ln_mode == HTREE_LOCK_PW || ++ poslk->lk_nodes[dep].ln_mode == HTREE_LOCK_PW) ++ break; ++ /* grant all readers */ ++ list_del_init(&poslk->lk_nodes[dep].ln_blocked_list); ++ list_add(&poslk->lk_nodes[dep].ln_granted_list, ++ &grtln->ln_granted_list); ++ ++ BUG_ON(poslk->lk_task == NULL); ++ wake_up_process(poslk->lk_task); ++ } ++ ++ /* if @curln is the owner of this key, replace it with @grtln */ ++ if (!htree_key_list_empty(curln)) ++ htree_key_list_replace_init(curln, grtln); ++ ++ if (curln->ln_mode == HTREE_LOCK_INVAL) ++ list_del_init(&curln->ln_alive_list); ++} ++ ++/* ++ * it's just wrapper of htree_node_lock_internal, it returns 1 on granted ++ * and 0 only if @wait is false and can't grant it immediately ++ */ ++int ++htree_node_lock_try(struct htree_lock *lck, htree_lock_mode_t mode, ++ u32 key, unsigned dep, int wait, void *event) ++{ ++ struct htree_lock_head *lhead = lck->lk_head; ++ int rc; ++ ++ BUG_ON(dep >= lck->lk_depth); ++ BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL); ++ ++ htree_spin_lock(lhead, dep); ++ rc = htree_node_lock_internal(lhead, lck, mode, key, dep, wait, event); ++ if (rc != 0) ++ htree_spin_unlock(lhead, dep); ++ return rc >= 0; ++} ++EXPORT_SYMBOL(htree_node_lock_try); ++ ++/* it's wrapper of htree_node_unlock_internal */ ++void ++htree_node_unlock(struct htree_lock *lck, unsigned dep, void *event) ++{ ++ struct htree_lock_head *lhead = lck->lk_head; ++ ++ BUG_ON(dep >= lck->lk_depth); ++ BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL); ++ ++ htree_spin_lock(lhead, dep); ++ htree_node_unlock_internal(lhead, lck, dep, event); ++ htree_spin_unlock(lhead, dep); ++} ++EXPORT_SYMBOL(htree_node_unlock); ++ ++/* stop listening on child-lock level @dep */ ++void ++htree_node_stop_listen(struct htree_lock *lck, unsigned dep) ++{ ++ struct htree_lock_node *ln = &lck->lk_nodes[dep]; ++ struct htree_lock_node *tmp; ++ ++ BUG_ON(htree_node_is_granted(lck, dep)); ++ BUG_ON(!list_empty(&ln->ln_blocked_list)); ++ BUG_ON(!list_empty(&ln->ln_granted_list)); ++ ++ if (!htree_node_is_listening(lck, dep)) ++ return; ++ ++ htree_spin_lock(lck->lk_head, dep); ++ ln->ln_mode = HTREE_LOCK_INVAL; ++ ln->ln_ev_target = NULL; ++ ++ if (htree_key_list_empty(ln)) { /* not owner */ ++ list_del_init(&ln->ln_alive_list); ++ goto out; ++ } ++ ++ /* I'm the owner... */ ++ if (list_empty(&ln->ln_alive_list)) { /* no more listener */ ++ htree_key_list_del_init(ln); ++ goto out; ++ } ++ ++ tmp = list_entry(ln->ln_alive_list.next, ++ struct htree_lock_node, ln_alive_list); ++ ++ BUG_ON(tmp->ln_mode != HTREE_LOCK_NL); ++ htree_key_list_replace_init(ln, tmp); ++ list_del_init(&ln->ln_alive_list); ++ out: ++ htree_spin_unlock(lck->lk_head, dep); ++} ++EXPORT_SYMBOL(htree_node_stop_listen); ++ ++/* release all child-locks if we have any */ ++static void ++htree_node_release_all(struct htree_lock *lck) ++{ ++ int i; ++ ++ for (i = 0; i < lck->lk_depth; i++) { ++ if (htree_node_is_granted(lck, i)) ++ htree_node_unlock(lck, i, NULL); ++ else if (htree_node_is_listening(lck, i)) ++ htree_node_stop_listen(lck, i); ++ } ++} ++ ++/* ++ * obtain htree lock, it could be blocked inside if there's conflict ++ * with any granted or blocked lock and @wait is true. 
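++ * Note that a new request is also blocked by conflicts with already
++ * *blocked* requests, not only granted ones, so a strong mode such as
++ * EX cannot be starved by a steady stream of compatible weak requests.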
++ * NB: ALWAYS called holding lhead::lh_lock ++ */ ++static int ++htree_lock_internal(struct htree_lock *lck, int wait) ++{ ++ struct htree_lock_head *lhead = lck->lk_head; ++ int granted = 0; ++ int blocked = 0; ++ int i; ++ ++ for (i = 0; i < HTREE_LOCK_MAX; i++) { ++ if (lhead->lh_ngranted[i] != 0) ++ granted |= 1 << i; ++ if (lhead->lh_nblocked[i] != 0) ++ blocked |= 1 << i; ++ } ++ if ((htree_lock_compat[lck->lk_mode] & granted) != granted || ++ (htree_lock_compat[lck->lk_mode] & blocked) != blocked) { ++ /* will block current lock even it just conflicts with any ++ * other blocked lock, so lock like EX wouldn't starve */ ++ if (!wait) ++ return -1; ++ lhead->lh_nblocked[lck->lk_mode]++; ++ lk_block_inc(lck->lk_mode); ++ ++ lck->lk_task = current; ++ list_add_tail(&lck->lk_blocked_list, &lhead->lh_blocked_list); ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ htree_spin_unlock(lhead, HTREE_DEP_ROOT); ++ /* wait to be given the lock */ ++ if (lck->lk_task != NULL) ++ schedule(); ++ /* granted, no doubt. wake up will set me RUNNING */ ++ return 0; /* without lh_lock */ ++ } ++ lhead->lh_ngranted[lck->lk_mode]++; ++ lk_grant_inc(lck->lk_mode); ++ return 1; ++} ++ ++/* release htree lock. NB: ALWAYS called holding lhead::lh_lock */ ++static void ++htree_unlock_internal(struct htree_lock *lck) ++{ ++ struct htree_lock_head *lhead = lck->lk_head; ++ struct htree_lock *tmp; ++ struct htree_lock *tmp2; ++ int granted = 0; ++ int i; ++ ++ BUG_ON(lhead->lh_ngranted[lck->lk_mode] == 0); ++ ++ lhead->lh_ngranted[lck->lk_mode]--; ++ lck->lk_mode = HTREE_LOCK_INVAL; ++ ++ for (i = 0; i < HTREE_LOCK_MAX; i++) { ++ if (lhead->lh_ngranted[i] != 0) ++ granted |= 1 << i; ++ } ++ list_for_each_entry_safe(tmp, tmp2, ++ &lhead->lh_blocked_list, lk_blocked_list) { ++ /* conflict with any granted lock? */ ++ if ((htree_lock_compat[tmp->lk_mode] & granted) != granted) ++ break; ++ ++ list_del_init(&tmp->lk_blocked_list); ++ ++ BUG_ON(lhead->lh_nblocked[tmp->lk_mode] == 0); ++ ++ lhead->lh_nblocked[tmp->lk_mode]--; ++ lhead->lh_ngranted[tmp->lk_mode]++; ++ granted |= 1 << tmp->lk_mode; ++ ++ BUG_ON(tmp->lk_task == NULL); ++ wake_up_process(tmp->lk_task); ++ } ++} ++ ++/* it's wrapper of htree_lock_internal and exported interface. ++ * It always return 1 with granted lock if @wait is true, it can return 0 ++ * if @wait is false and locking request can't be granted immediately */ ++int ++htree_lock_try(struct htree_lock *lck, struct htree_lock_head *lhead, ++ htree_lock_mode_t mode, int wait) ++{ ++ int rc; ++ ++ BUG_ON(lck->lk_depth > lhead->lh_depth); ++ BUG_ON(lck->lk_head != NULL); ++ BUG_ON(lck->lk_task != NULL); ++ ++ lck->lk_head = lhead; ++ lck->lk_mode = mode; ++ ++ htree_spin_lock(lhead, HTREE_DEP_ROOT); ++ rc = htree_lock_internal(lck, wait); ++ if (rc != 0) ++ htree_spin_unlock(lhead, HTREE_DEP_ROOT); ++ return rc >= 0; ++} ++EXPORT_SYMBOL(htree_lock_try); ++ ++/* it's wrapper of htree_unlock_internal and exported interface. 
++ * It will release all htree_node_locks and htree_lock */ ++void ++htree_unlock(struct htree_lock *lck) ++{ ++ BUG_ON(lck->lk_head == NULL); ++ BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL); ++ ++ htree_node_release_all(lck); ++ ++ htree_spin_lock(lck->lk_head, HTREE_DEP_ROOT); ++ htree_unlock_internal(lck); ++ htree_spin_unlock(lck->lk_head, HTREE_DEP_ROOT); ++ lck->lk_head = NULL; ++ lck->lk_task = NULL; ++} ++EXPORT_SYMBOL(htree_unlock); ++ ++/* change lock mode */ ++void ++htree_change_mode(struct htree_lock *lck, htree_lock_mode_t mode) ++{ ++ BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL); ++ lck->lk_mode = mode; ++} ++EXPORT_SYMBOL(htree_change_mode); ++ ++/* release htree lock, and lock it again with new mode. ++ * This function will first release all htree_node_locks and htree_lock, ++ * then try to gain htree_lock with new @mode. ++ * It always return 1 with granted lock if @wait is true, it can return 0 ++ * if @wait is false and locking request can't be granted immediately */ ++int ++htree_change_lock_try(struct htree_lock *lck, htree_lock_mode_t mode, int wait) ++{ ++ struct htree_lock_head *lhead = lck->lk_head; ++ int rc; ++ ++ BUG_ON(lhead == NULL); ++ BUG_ON(lck->lk_mode == mode); ++ BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL || mode == HTREE_LOCK_INVAL); ++ ++ htree_node_release_all(lck); ++ ++ htree_spin_lock(lhead, HTREE_DEP_ROOT); ++ htree_unlock_internal(lck); ++ lck->lk_mode = mode; ++ rc = htree_lock_internal(lck, wait); ++ if (rc != 0) ++ htree_spin_unlock(lhead, HTREE_DEP_ROOT); ++ return rc >= 0; ++} ++EXPORT_SYMBOL(htree_change_lock_try); ++ ++/* create a htree_lock head with @depth levels (number of child-locks), ++ * it is a per resoruce structure */ ++struct htree_lock_head * ++htree_lock_head_alloc(unsigned depth, unsigned hbits, unsigned priv) ++{ ++ struct htree_lock_head *lhead; ++ int i; ++ ++ if (depth > HTREE_LOCK_DEP_MAX) { ++ printk(KERN_ERR "%d is larger than max htree_lock depth %d\n", ++ depth, HTREE_LOCK_DEP_MAX); ++ return NULL; ++ } ++ ++ lhead = kzalloc(offsetof(struct htree_lock_head, ++ lh_children[depth]) + priv, GFP_NOFS); ++ if (lhead == NULL) ++ return NULL; ++ ++ if (hbits < HTREE_HBITS_MIN) ++ lhead->lh_hbits = HTREE_HBITS_MIN; ++ else if (hbits > HTREE_HBITS_MAX) ++ lhead->lh_hbits = HTREE_HBITS_MAX; ++ ++ lhead->lh_lock = 0; ++ lhead->lh_depth = depth; ++ INIT_LIST_HEAD(&lhead->lh_blocked_list); ++ if (priv > 0) { ++ lhead->lh_private = (void *)lhead + ++ offsetof(struct htree_lock_head, lh_children[depth]); ++ } ++ ++ for (i = 0; i < depth; i++) { ++ INIT_LIST_HEAD(&lhead->lh_children[i].lc_list); ++ lhead->lh_children[i].lc_events = HTREE_EVENT_DISABLE; ++ } ++ return lhead; ++} ++EXPORT_SYMBOL(htree_lock_head_alloc); ++ ++/* free the htree_lock head */ ++void ++htree_lock_head_free(struct htree_lock_head *lhead) ++{ ++ int i; ++ ++ BUG_ON(!list_empty(&lhead->lh_blocked_list)); ++ for (i = 0; i < lhead->lh_depth; i++) ++ BUG_ON(!list_empty(&lhead->lh_children[i].lc_list)); ++ kfree(lhead); ++} ++EXPORT_SYMBOL(htree_lock_head_free); ++ ++/* register event callback for @events of child-lock at level @dep */ ++void ++htree_lock_event_attach(struct htree_lock_head *lhead, unsigned dep, ++ unsigned events, htree_event_cb_t callback) ++{ ++ BUG_ON(lhead->lh_depth <= dep); ++ lhead->lh_children[dep].lc_events = events; ++ lhead->lh_children[dep].lc_callback = callback; ++} ++EXPORT_SYMBOL(htree_lock_event_attach); ++ ++/* allocate a htree_lock, which is per-thread structure, @pbytes is some ++ * extra-bytes as private data for caller */ ++struct 
htree_lock *
++htree_lock_alloc(unsigned depth, unsigned pbytes)
++{
++	struct htree_lock *lck;
++	int i = offsetof(struct htree_lock, lk_nodes[depth]);
++
++	if (depth > HTREE_LOCK_DEP_MAX) {
++		printk(KERN_ERR "%d is larger than max htree_lock depth %d\n",
++		       depth, HTREE_LOCK_DEP_MAX);
++		return NULL;
++	}
++	lck = kzalloc(i + pbytes, GFP_NOFS);
++	if (lck == NULL)
++		return NULL;
++
++	if (pbytes != 0)
++		lck->lk_private = (void *)lck + i;
++	lck->lk_mode = HTREE_LOCK_INVAL;
++	lck->lk_depth = depth;
++	INIT_LIST_HEAD(&lck->lk_blocked_list);
++
++	for (i = 0; i < depth; i++) {
++		struct htree_lock_node *node = &lck->lk_nodes[i];
++
++		node->ln_mode = HTREE_LOCK_INVAL;
++		INIT_LIST_HEAD(&node->ln_major_list);
++		INIT_LIST_HEAD(&node->ln_minor_list);
++		INIT_LIST_HEAD(&node->ln_alive_list);
++		INIT_LIST_HEAD(&node->ln_blocked_list);
++		INIT_LIST_HEAD(&node->ln_granted_list);
++	}
++
++	return lck;
++}
++EXPORT_SYMBOL(htree_lock_alloc);
++
++/* free htree_lock node */
++void
++htree_lock_free(struct htree_lock *lck)
++{
++	BUG_ON(lck->lk_mode != HTREE_LOCK_INVAL);
++	kfree(lck);
++}
++EXPORT_SYMBOL(htree_lock_free);
+--- linux-2.6.32-131.6.1/fs/ext4/ext4.h	2011-10-06 20:10:49.000000000 +0800
++++ linux-2.6.32-131.6.1-pdo/fs/ext4/ext4.h	2011-12-08 18:25:00.000000000 +0800
+@@ -28,6 +28,7 @@
+ #include
+ #include
+ #include
++#include <linux/htree_lock.h>
+ #include
+ #include
+ #ifdef __KERNEL__
+@@ -1277,6 +1278,7 @@ EXT4_INODE_BIT_FNS(state, state_flags)
+ #define EXT4_FEATURE_INCOMPAT_MMP		0x0100
+ #define EXT4_FEATURE_INCOMPAT_FLEX_BG		0x0200
+ #define EXT4_FEATURE_INCOMPAT_DIRDATA		0x1000
++#define EXT4_FEATURE_INCOMPAT_LARGEDIR		0x4000
+
+ #define EXT4_FEATURE_COMPAT_SUPP	EXT2_FEATURE_COMPAT_EXT_ATTR
+ #define EXT4_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
+@@ -1286,7 +1288,8 @@ EXT4_INODE_BIT_FNS(state, state_flags)
+					 EXT4_FEATURE_INCOMPAT_64BIT| \
+					 EXT4_FEATURE_INCOMPAT_FLEX_BG| \
+					 EXT4_FEATURE_INCOMPAT_MMP| \
+-					 EXT4_FEATURE_INCOMPAT_DIRDATA)
++					 EXT4_FEATURE_INCOMPAT_DIRDATA| \
++					 EXT4_FEATURE_INCOMPAT_LARGEDIR)
+
+ #define EXT4_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+@@ -1536,6 +1539,76 @@ ext4_group_first_block_no(struct super_b
+  */
+ #define ERR_BAD_DX_DIR	-75000
+
++/* htree levels for ext4 */
++#define EXT4_HTREE_LEVEL_COMPAT	2
++#define EXT4_HTREE_LEVEL	3
++
++static inline int
++ext4_dir_htree_level(struct super_block *sb)
++{
++	return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ?
++ EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT; ++} ++ ++/* assume name-hash is protected by upper layer */ ++#define EXT4_HTREE_LOCK_HASH 0 ++ ++enum ext4_pdo_lk_types { ++#if EXT4_HTREE_LOCK_HASH ++ EXT4_LK_HASH, ++#endif ++ EXT4_LK_DX, /* index block */ ++ EXT4_LK_DE, /* directory entry block */ ++ EXT4_LK_SPIN, /* spinlock */ ++ EXT4_LK_MAX, ++}; ++ ++/* read-only bit */ ++#define EXT4_LB_RO(b) (1 << (b)) ++/* read + write, high bits for writer */ ++#define EXT4_LB_RW(b) ((1 << (b)) | (1 << (EXT4_LK_MAX + (b)))) ++ ++enum ext4_pdo_lock_bits { ++ /* DX lock bits */ ++ EXT4_LB_DX_RO = EXT4_LB_RO(EXT4_LK_DX), ++ EXT4_LB_DX = EXT4_LB_RW(EXT4_LK_DX), ++ /* DE lock bits */ ++ EXT4_LB_DE_RO = EXT4_LB_RO(EXT4_LK_DE), ++ EXT4_LB_DE = EXT4_LB_RW(EXT4_LK_DE), ++ /* DX spinlock bits */ ++ EXT4_LB_SPIN_RO = EXT4_LB_RO(EXT4_LK_SPIN), ++ EXT4_LB_SPIN = EXT4_LB_RW(EXT4_LK_SPIN), ++ /* accurate searching */ ++ EXT4_LB_EXACT = EXT4_LB_RO(EXT4_LK_MAX << 1), ++}; ++ ++enum ext4_pdo_lock_opc { ++ /* external */ ++ EXT4_HLOCK_READDIR = (EXT4_LB_DE_RO | EXT4_LB_DX_RO), ++ EXT4_HLOCK_LOOKUP = (EXT4_LB_DE_RO | EXT4_LB_SPIN_RO | ++ EXT4_LB_EXACT), ++ EXT4_HLOCK_DEL = (EXT4_LB_DE | EXT4_LB_SPIN_RO | ++ EXT4_LB_EXACT), ++ EXT4_HLOCK_ADD = (EXT4_LB_DE | EXT4_LB_SPIN_RO), ++ ++ /* internal */ ++ EXT4_HLOCK_LOOKUP_SAFE = (EXT4_LB_DE_RO | EXT4_LB_DX_RO | ++ EXT4_LB_EXACT), ++ EXT4_HLOCK_DEL_SAFE = (EXT4_LB_DE | EXT4_LB_DX_RO | EXT4_LB_EXACT), ++ EXT4_HLOCK_SPLIT = (EXT4_LB_DE | EXT4_LB_DX | EXT4_LB_SPIN), ++}; ++ ++extern struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits); ++#define ext4_htree_lock_head_free(lhead) htree_lock_head_free(lhead) ++ ++extern struct htree_lock *ext4_htree_lock_alloc(void); ++#define ext4_htree_lock_free(lck) htree_lock_free(lck) ++ ++extern void ext4_htree_lock(struct htree_lock *lck, ++ struct htree_lock_head *lhead, ++ struct inode *dir, unsigned flags); ++#define ext4_htree_unlock(lck) htree_unlock(lck) ++ + void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, + ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp); + +@@ -1769,14 +1842,16 @@ extern int ext4_htree_fill_tree(struct f + extern struct inode *ext4_create_inode(handle_t *handle, + struct inode * dir, int mode); + extern int ext4_add_entry(handle_t *handle, struct dentry *dentry, +- struct inode *inode); ++ struct inode *inode, struct htree_lock *lck); + extern int ext4_delete_entry(handle_t *handle, struct inode * dir, + struct ext4_dir_entry_2 * de_del, + struct buffer_head * bh); + extern struct buffer_head * ext4_find_entry(struct inode *dir, + const struct qstr *d_name, +- struct ext4_dir_entry_2 ** res_dir); +-#define ll_ext4_find_entry(inode, dentry, res_dir) ext4_find_entry(inode, &(dentry)->d_name, res_dir) ++ struct ext4_dir_entry_2 **res_dir, ++ struct htree_lock *lck); ++#define ll_ext4_find_entry(inode, dentry, res_dir, lck) \ ++ ext4_find_entry(inode, &(dentry)->d_name, res_dir, lck) + extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir, + struct inode *inode, const void *, const void *); + extern struct buffer_head *ext4_append(handle_t *handle, +@@ -1893,13 +1968,15 @@ static inline void ext4_r_blocks_count_s + es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); + } + +-static inline loff_t ext4_isize(struct ext4_inode *raw_inode) ++static inline loff_t ext4_isize(struct super_block *sb, ++ struct ext4_inode *raw_inode) + { +- if (S_ISREG(le16_to_cpu(raw_inode->i_mode))) ++ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) || ++ 
S_ISREG(le16_to_cpu(raw_inode->i_mode))) + return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | + le32_to_cpu(raw_inode->i_size_lo); +- else +- return (loff_t) le32_to_cpu(raw_inode->i_size_lo); ++ ++ return (loff_t) le32_to_cpu(raw_inode->i_size_lo); + } + + static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) +--- linux-2.6.32-131.6.1/fs/ext4/namei.c 2011-10-06 20:10:49.000000000 +0800 ++++ linux-2.6.32-131.6.1-pdo/fs/ext4/namei.c 2011-12-14 22:55:28.000000000 +0800 +@@ -176,7 +176,7 @@ static struct dx_frame *dx_probe(const s + struct inode *dir, + struct dx_hash_info *hinfo, + struct dx_frame *frame, +- int *err); ++ struct htree_lock *lck, int *err); + static void dx_release(struct dx_frame *frames); + static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, + struct dx_hash_info *hinfo, struct dx_map_entry map[]); +@@ -189,13 +189,13 @@ static void dx_insert_block(struct dx_fr + static int ext4_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, +- __u32 *start_hash); ++ __u32 *start_hash, struct htree_lock *lck); + static struct buffer_head * ext4_dx_find_entry(struct inode *dir, + const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, +- int *err); ++ struct htree_lock *lck, int *err); + static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, +- struct inode *inode); ++ struct inode *inode, struct htree_lock *lck); + + /* + * p is at least 6 bytes before the end of page +@@ -225,7 +225,7 @@ struct dx_root_info * dx_get_dx_info(str + + static inline ext4_lblk_t dx_get_block(struct dx_entry *entry) + { +- return le32_to_cpu(entry->block) & 0x00ffffff; ++ return le32_to_cpu(entry->block) & 0x0fffffff; + } + + static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value) +@@ -368,6 +368,223 @@ struct stats dx_show_entries(struct dx_h + } + #endif /* DX_DEBUG */ + ++/* private data for htree_lock */ ++struct ext4_dir_lock_data { ++ unsigned ld_flags; /* bits-map for lock types */ ++ unsigned ld_count; /* # entries of the last DX block */ ++ struct dx_entry ld_at_entry; /* copy of leaf dx_entry */ ++ struct dx_entry *ld_at; /* position of leaf dx_entry */ ++}; ++ ++#define ext4_htree_lock_data(l) ((struct ext4_dir_lock_data *)(l)->lk_private) ++ ++/* NB: ext4_lblk_t is 32 bits so we use high bits to identify invalid blk */ ++#define EXT4_HTREE_NODE_CHANGED (0xcafeULL << 32) ++ ++static void ext4_htree_event_cb(void *target, void *event) ++{ ++ u64 *block = (u64 *)target; ++ ++ if (*block == dx_get_block((struct dx_entry *)event)) ++ *block = EXT4_HTREE_NODE_CHANGED; ++} ++ ++struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits) ++{ ++ struct htree_lock_head *lhead; ++ ++ lhead = htree_lock_head_alloc(EXT4_LK_MAX, hbits, 0); ++ if (lhead != NULL) { ++ htree_lock_event_attach(lhead, EXT4_LK_SPIN, HTREE_EVENT_WR, ++ ext4_htree_event_cb); ++ } ++ return lhead; ++} ++EXPORT_SYMBOL(ext4_htree_lock_head_alloc); ++ ++struct htree_lock *ext4_htree_lock_alloc(void) ++{ ++ return htree_lock_alloc(EXT4_LK_MAX, ++ sizeof(struct ext4_dir_lock_data)); ++} ++EXPORT_SYMBOL(ext4_htree_lock_alloc); ++ ++static htree_lock_mode_t ext4_htree_mode(unsigned flags) ++{ ++ switch (flags) { ++ default: /* 0 or unknown flags require EX lock */ ++ return HTREE_LOCK_EX; ++ case EXT4_HLOCK_READDIR: ++ return HTREE_LOCK_PR; ++ case EXT4_HLOCK_LOOKUP: ++ return HTREE_LOCK_CR; ++ case EXT4_HLOCK_DEL: ++ case EXT4_HLOCK_ADD: ++ return HTREE_LOCK_CW; ++ } ++} ++ ++/* return PR 
for read-only operations, otherwise return EX */ ++static inline htree_lock_mode_t ext4_htree_safe_mode(unsigned flags) ++{ ++ int writer = (flags & EXT4_LB_DE) == EXT4_LB_DE; ++ ++ /* 0 requires EX lock */ ++ return (flags == 0 || writer) ? HTREE_LOCK_EX : HTREE_LOCK_PR; ++} ++ ++static int ext4_htree_safe_locked(struct htree_lock *lck) ++{ ++ int writer; ++ ++ if (lck == NULL || lck->lk_mode == HTREE_LOCK_EX) ++ return 1; ++ ++ writer = (ext4_htree_lock_data(lck)->ld_flags & EXT4_LB_DE) == ++ EXT4_LB_DE; ++ if (writer) /* all readers & writers are excluded? */ ++ return lck->lk_mode == HTREE_LOCK_EX; ++ ++ /* all writers are excluded? */ ++ return lck->lk_mode == HTREE_LOCK_PR || ++ lck->lk_mode == HTREE_LOCK_PW || ++ lck->lk_mode == HTREE_LOCK_EX; ++} ++ ++/* relock htree_lock with EX mode if it's change operation, otherwise ++ * relock it with PR mode. It's noop if PDO is disabled. */ ++static void ext4_htree_safe_relock(struct htree_lock *lck) ++{ ++ if (!ext4_htree_safe_locked(lck)) { ++ unsigned flags = ext4_htree_lock_data(lck)->ld_flags; ++ ++ htree_change_lock(lck, ext4_htree_safe_mode(flags)); ++ } ++} ++ ++void ext4_htree_lock(struct htree_lock *lck, struct htree_lock_head *lhead, ++ struct inode *dir, unsigned flags) ++{ ++ htree_lock_mode_t mode = is_dx(dir) ? ext4_htree_mode(flags) : ++ ext4_htree_safe_mode(flags); ++ ++ ext4_htree_lock_data(lck)->ld_flags = flags; ++ htree_lock(lck, lhead, mode); ++ if (!is_dx(dir)) ++ ext4_htree_safe_relock(lck); /* make sure it's safe locked */ ++} ++EXPORT_SYMBOL(ext4_htree_lock); ++ ++static int ext4_htree_node_lock(struct htree_lock *lck, struct dx_entry *at, ++ unsigned lmask, int wait, void *ev) ++{ ++ u32 key = (at == NULL) ? 0 : dx_get_block(at); ++ u32 mode; ++ ++ /* NOOP if htree is well protected or caller doesn't require the lock */ ++ if (ext4_htree_safe_locked(lck) || ++ !(ext4_htree_lock_data(lck)->ld_flags & lmask)) ++ return 1; ++ ++ mode = (ext4_htree_lock_data(lck)->ld_flags & lmask) == lmask ? 
++ HTREE_LOCK_PW : HTREE_LOCK_PR; ++ while (1) { ++ if (htree_node_lock_try(lck, mode, key, ffz(~lmask), wait, ev)) ++ return 1; ++ if (!(lmask & EXT4_LB_SPIN)) /* not a spinlock */ ++ return 0; ++ cpu_relax(); /* spin until granted */ ++ } ++} ++ ++static int ext4_htree_node_locked(struct htree_lock *lck, unsigned lmask) ++{ ++ return ext4_htree_safe_locked(lck) || ++ htree_node_is_granted(lck, ffz(~lmask)); ++} ++ ++static void ext4_htree_node_unlock(struct htree_lock *lck, ++ unsigned lmask, void *buf) ++{ ++ /* NB: it's safe to call mutiple times or even it's not locked */ ++ if (!ext4_htree_safe_locked(lck) && ++ htree_node_is_granted(lck, ffz(~lmask))) ++ htree_node_unlock(lck, ffz(~lmask), buf); ++} ++ ++#define ext4_htree_dx_lock(lck, key) \ ++ ext4_htree_node_lock(lck, key, EXT4_LB_DX, 1, NULL) ++#define ext4_htree_dx_lock_try(lck, key) \ ++ ext4_htree_node_lock(lck, key, EXT4_LB_DX, 0, NULL) ++#define ext4_htree_dx_unlock(lck) \ ++ ext4_htree_node_unlock(lck, EXT4_LB_DX, NULL) ++#define ext4_htree_dx_locked(lck) \ ++ ext4_htree_node_locked(lck, EXT4_LB_DX) ++ ++static void ext4_htree_dx_need_lock(struct htree_lock *lck) ++{ ++ struct ext4_dir_lock_data *ld; ++ ++ if (ext4_htree_safe_locked(lck)) ++ return; ++ ++ ld = ext4_htree_lock_data(lck); ++ switch (ld->ld_flags) { ++ default: ++ return; ++ case EXT4_HLOCK_LOOKUP: ++ ld->ld_flags = EXT4_HLOCK_LOOKUP_SAFE; ++ return; ++ case EXT4_HLOCK_DEL: ++ ld->ld_flags = EXT4_HLOCK_DEL_SAFE; ++ return; ++ case EXT4_HLOCK_ADD: ++ ld->ld_flags = EXT4_HLOCK_SPLIT; ++ return; ++ } ++} ++ ++#define ext4_htree_de_lock(lck, key) \ ++ ext4_htree_node_lock(lck, key, EXT4_LB_DE, 1, NULL) ++#define ext4_htree_de_unlock(lck) \ ++ ext4_htree_node_unlock(lck, EXT4_LB_DE, NULL) ++ ++#define ext4_htree_spin_lock(lck, key, event) \ ++ ext4_htree_node_lock(lck, key, EXT4_LB_SPIN, 0, event) ++#define ext4_htree_spin_unlock(lck) \ ++ ext4_htree_node_unlock(lck, EXT4_LB_SPIN, NULL) ++#define ext4_htree_spin_unlock_listen(lck, p) \ ++ ext4_htree_node_unlock(lck, EXT4_LB_SPIN, p) ++ ++static void ext4_htree_spin_stop_listen(struct htree_lock *lck) ++{ ++ if (!ext4_htree_safe_locked(lck) && ++ htree_node_is_listening(lck, ffz(~EXT4_LB_SPIN))) ++ htree_node_stop_listen(lck, ffz(~EXT4_LB_SPIN)); ++} ++ ++enum { ++ DX_HASH_COL_IGNORE, /* ignore collision while probing frames */ ++ DX_HASH_COL_YES, /* there is collision and it does matter */ ++ DX_HASH_COL_NO, /* there is no collision */ ++}; ++ ++static int dx_probe_hash_collision(struct htree_lock *lck, ++ struct dx_entry *entries, ++ struct dx_entry *at, u32 hash) ++{ ++ if (!(ext4_htree_lock_data(lck)->ld_flags & EXT4_LB_EXACT)) { ++ return DX_HASH_COL_IGNORE; /* don't care about collision */ ++ ++ } else if (at == entries + dx_get_count(entries) - 1) { ++ return DX_HASH_COL_IGNORE; /* not in any leaf of this DX */ ++ ++ } else { /* hash collision? */ ++ return ((dx_get_hash(at + 1) & ~1) == hash) ? ++ DX_HASH_COL_YES : DX_HASH_COL_NO; ++ } ++} ++ + /* + * Probe for a directory leaf block to search. 
+ * +@@ -379,16 +596,17 @@ struct stats dx_show_entries(struct dx_h + */ + static struct dx_frame * + dx_probe(const struct qstr *d_name, struct inode *dir, +- struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) ++ struct dx_hash_info *hinfo, struct dx_frame *frame_in, ++ struct htree_lock *lck, int *err) + { + unsigned count, indirect; +- struct dx_entry *at, *entries, *p, *q, *m; ++ struct dx_entry *at, *entries, *p, *q, *m, *dx = NULL; + struct dx_root_info * info; + struct buffer_head *bh; + struct dx_frame *frame = frame_in; + u32 hash; + +- frame->bh = NULL; ++ memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0])); + if (!(bh = ext4_bread (NULL,dir, 0, 0, err))) + goto fail; + +@@ -418,9 +636,16 @@ dx_probe(const struct qstr *d_name, stru + goto fail; + } + +- if ((indirect = info->indirect_levels) > 1) { +- ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x", +- info->indirect_levels); ++ indirect = info->indirect_levels; ++ if (indirect >= ext4_dir_htree_level(dir->i_sb)) { ++ ext4_warning(dir->i_sb, ++ "Directory (ino: %lu) htree depth %#06x exceed " ++ "supported value", dir->i_ino, ++ ext4_dir_htree_level(dir->i_sb)); ++ if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) { ++ ext4_warning(dir->i_sb, "Enable large directory " ++ "feature to access it"); ++ } + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; +@@ -440,8 +665,15 @@ dx_probe(const struct qstr *d_name, stru + dxtrace(printk("Look up %x", hash)); + while (1) + { ++ if (indirect == 0) { /* the last index level */ ++ /* NB: ext4_htree_dx_lock() could be noop if ++ * DX-lock flag is not set for current operation */ ++ ext4_htree_dx_lock(lck, dx); ++ ext4_htree_spin_lock(lck, dx, NULL); ++ } + count = dx_get_count(entries); +- if (!count || count > dx_get_limit(entries)) { ++ if (count == 0 || count > dx_get_limit(entries)) { ++ ext4_htree_spin_unlock(lck); /* release spin */ + ext4_warning(dir->i_sb, + "dx entry: no count or count > limit"); + brelse(bh); +@@ -482,9 +714,73 @@ dx_probe(const struct qstr *d_name, stru + frame->bh = bh; + frame->entries = entries; + frame->at = at; +- if (!indirect--) return frame; ++ ++ if (indirect == 0) { /* the last index level */ ++ struct ext4_dir_lock_data *ld; ++ u64 myblock; ++ ++ /* By default we only lock DE-block, however, we will ++ * also lock the last level DX-block if: ++ * a) there is hash collision ++ * we will set DX-lock flag (a few lines below) ++ * and redo to lock DX-block ++ * see detail in dx_probe_hash_collision() ++ * b) it's a retry from splitting ++ * we need to lock the last level DX-block so nobody ++ * else can split any leaf blocks under the same ++ * DX-block, see detail in ext4_dx_add_entry() ++ */ ++ if (ext4_htree_dx_locked(lck)) { ++ /* DX-block is locked, just lock DE-block ++ * and return */ ++ ext4_htree_spin_unlock(lck); ++ if (!ext4_htree_safe_locked(lck)) ++ ext4_htree_de_lock(lck, frame->at); ++ return frame; ++ } ++ /* it's pdirop and no DX lock */ ++ if (dx_probe_hash_collision(lck, entries, at, hash) == ++ DX_HASH_COL_YES) { ++ /* found hash collision, set DX-lock flag ++ * and retry to abtain DX-lock */ ++ ext4_htree_spin_unlock(lck); ++ ext4_htree_dx_need_lock(lck); ++ continue; ++ } ++ ld = ext4_htree_lock_data(lck); ++ /* because I don't lock DX, so @at can't be trusted ++ * after I release spinlock so I have to save it */ ++ ld->ld_at = at; ++ ld->ld_at_entry = *at; ++ ld->ld_count = dx_get_count(entries); ++ ++ frame->at = &ld->ld_at_entry; ++ myblock = dx_get_block(at); ++ ++ /* NB: ordering locking 
*/ ++ ext4_htree_spin_unlock_listen(lck, &myblock); ++ /* other thread can split this DE-block because: ++ * a) I don't have lock for the DE-block yet ++ * b) I released spinlock on DX-block ++ * if it happened I can detect it by listening ++ * splitting event on this DE-block */ ++ ext4_htree_de_lock(lck, frame->at); ++ ext4_htree_spin_stop_listen(lck); ++ ++ if (myblock == EXT4_HTREE_NODE_CHANGED) { ++ /* someone split this DE-block before ++ * I locked it, I need to retry and lock ++ * valid DE-block */ ++ ext4_htree_de_unlock(lck); ++ continue; ++ } ++ return frame; ++ } ++ dx = at; ++ indirect--; + if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err))) + goto fail2; ++ + at = entries = ((struct dx_node *) bh->b_data)->entries; + if (dx_get_limit(entries) != dx_node_limit (dir)) { + ext4_warning(dir->i_sb, +@@ -512,13 +808,18 @@ fail: + static void dx_release (struct dx_frame *frames) + { + struct dx_root_info *info; ++ int i; ++ + if (frames[0].bh == NULL) + return; + + info = dx_get_dx_info((struct ext4_dir_entry_2*)frames[0].bh->b_data); +- if (info->indirect_levels) +- brelse(frames[1].bh); +- brelse(frames[0].bh); ++ for (i = 0; i <= info->indirect_levels; i++) { ++ if (frames[i].bh == NULL) ++ break; ++ brelse(frames[i].bh); ++ frames[i].bh = NULL; ++ } + } + + /* +@@ -541,7 +842,7 @@ static void dx_release (struct dx_frame + static int ext4_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, +- __u32 *start_hash) ++ __u32 *start_hash, struct htree_lock *lck) + { + struct dx_frame *p; + struct buffer_head *bh; +@@ -556,12 +857,22 @@ static int ext4_htree_next_block(struct + * this loop, num_frames indicates the number of interior + * nodes need to be read. + */ ++ ext4_htree_de_unlock(lck); + while (1) { +- if (++(p->at) < p->entries + dx_get_count(p->entries)) +- break; ++ if (num_frames > 0 || ext4_htree_dx_locked(lck)) { ++ /* num_frames > 0 : ++ * DX block ++ * ext4_htree_dx_locked: ++ * frame->at is reliable pointer returned by dx_probe, ++ * otherwise dx_probe already knew no collision */ ++ if (++(p->at) < p->entries + dx_get_count(p->entries)) ++ break; ++ } + if (p == frames) + return 0; + num_frames++; ++ if (num_frames == 1) ++ ext4_htree_dx_unlock(lck); + p--; + } + +@@ -584,6 +895,13 @@ static int ext4_htree_next_block(struct + * block so no check is necessary + */ + while (num_frames--) { ++ if (num_frames == 0) { ++ /* it's not always necessary, we just don't want to ++ * detect hash collision again */ ++ ext4_htree_dx_need_lock(lck); ++ ext4_htree_dx_lock(lck, p->at); ++ } ++ + if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at), + 0, &err))) + return err; /* Failure */ +@@ -592,6 +910,7 @@ static int ext4_htree_next_block(struct + p->bh = bh; + p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; + } ++ ext4_htree_de_lock(lck, p->at); + return 1; + } + +@@ -661,7 +980,7 @@ int ext4_htree_fill_tree(struct file *di + { + struct dx_hash_info hinfo; + struct ext4_dir_entry_2 *de; +- struct dx_frame frames[2], *frame; ++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; + struct inode *dir; + ext4_lblk_t block; + int count = 0; +@@ -684,10 +1003,10 @@ int ext4_htree_fill_tree(struct file *di + } + hinfo.hash = start_hash; + hinfo.minor_hash = 0; +- frame = dx_probe(NULL, dir, &hinfo, frames, &err); ++ /* assume it's PR locked */ ++ frame = dx_probe(NULL, dir, &hinfo, frames, NULL, &err); + if (!frame) + return err; +- + /* Add '.' and '..' 
from the htree header */ + if (!start_hash && !start_minor_hash) { + de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; +@@ -714,7 +1033,7 @@ int ext4_htree_fill_tree(struct file *di + count += ret; + hashval = ~0; + ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS, +- frame, frames, &hashval); ++ frame, frames, &hashval, NULL); + *next_hash = hashval; + if (ret < 0) { + err = ret; +@@ -814,9 +1133,17 @@ static void dx_insert_block(struct dx_fr + + static void ext4_update_dx_flag(struct inode *inode) + { ++ /* Disable it for ldiskfs, because going from a DX directory to ++ * a non-DX directory while it is in use will completely break ++ * the htree-locking. ++ * If we really want to support this operation in the future, ++ * we need to exclusively lock the directory at here which will ++ * increase complexity of code */ ++#if 0 + if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_COMPAT_DIR_INDEX)) + ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); ++#endif + } + + /* +@@ -889,8 +1216,9 @@ static inline int search_dirblock(struct + * to brelse() it when appropriate. + */ + struct buffer_head * ext4_find_entry(struct inode *dir, +- const struct qstr *d_name, +- struct ext4_dir_entry_2 ** res_dir) ++ const struct qstr *d_name, ++ struct ext4_dir_entry_2 **res_dir, ++ struct htree_lock *lck) + { + struct super_block *sb; + struct buffer_head *bh_use[NAMEI_RA_SIZE]; +@@ -911,7 +1239,7 @@ struct buffer_head * ext4_find_entry(str + if (namelen > EXT4_NAME_LEN) + return NULL; + if (is_dx(dir)) { +- bh = ext4_dx_find_entry(dir, d_name, res_dir, &err); ++ bh = ext4_dx_find_entry(dir, d_name, res_dir, lck, &err); + /* + * On success, or if the error was file not found, + * return. Otherwise, fall back to doing a search the +@@ -921,6 +1249,7 @@ struct buffer_head * ext4_find_entry(str + return bh; + dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " + "falling back\n")); ++ ext4_htree_safe_relock(lck); + } + nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); + start = EXT4_I(dir)->i_dir_start_lookup; +@@ -998,13 +1327,15 @@ cleanup_and_exit: + } + EXPORT_SYMBOL(ext4_find_entry); + +-static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, +- struct ext4_dir_entry_2 **res_dir, int *err) ++static struct buffer_head * ext4_dx_find_entry(struct inode *dir, ++ const struct qstr *d_name, ++ struct ext4_dir_entry_2 **res_dir, ++ struct htree_lock *lck, int *err) + { + struct super_block * sb; + struct dx_hash_info hinfo; + u32 hash; +- struct dx_frame frames[2], *frame; ++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; + struct ext4_dir_entry_2 *de, *top; + struct buffer_head *bh; + ext4_lblk_t block; +@@ -1015,13 +1346,16 @@ static struct buffer_head * ext4_dx_find + sb = dir->i_sb; + /* NFS may look up ".." - look at dx_root directory block */ + if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){ +- if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err))) ++ if (!(frame = dx_probe(d_name, dir, &hinfo, frames, lck, err))) + return NULL; + } else { + frame = frames; + frame->bh = NULL; /* for dx_release() */ + frame->at = (struct dx_entry *)frames; /* hack for zero entry*/ + dx_set_block(frame->at, 0); /* dx_root block is 0 */ ++ /* "." and ".." 
are stored in root DX lock */ ++ ext4_htree_dx_need_lock(lck); ++ ext4_htree_dx_lock(lck, NULL); + } + hash = hinfo.hash; + do { +@@ -1050,7 +1384,7 @@ static struct buffer_head * ext4_dx_find + brelse(bh); + /* Check to see if we should continue to search */ + retval = ext4_htree_next_block(dir, hash, frame, +- frames, NULL); ++ frames, NULL, lck); + if (retval < 0) { + ext4_warning(sb, + "error reading index page in directory #%lu", +@@ -1076,7 +1410,7 @@ static struct dentry *ext4_lookup(struct + if (dentry->d_name.len > EXT4_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + +- bh = ext4_find_entry(dir, &dentry->d_name, &de); ++ bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); + inode = NULL; + if (bh) { + __u32 ino = le32_to_cpu(de->inode); +@@ -1144,7 +1478,7 @@ struct dentry *ext4_get_parent(struct de + struct ext4_dir_entry_2 * de; + struct buffer_head *bh; + +- bh = ext4_find_entry(child->d_inode, &dotdot, &de); ++ bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL); + inode = NULL; + if (!bh) + return ERR_PTR(-ENOENT); +@@ -1233,8 +1567,9 @@ static struct ext4_dir_entry_2* dx_pack_ + * Returns pointer to de in block into which the new entry will be inserted. + */ + static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, +- struct buffer_head **bh,struct dx_frame *frame, +- struct dx_hash_info *hinfo, int *error) ++ struct buffer_head **bh, struct dx_frame *frames, ++ struct dx_frame *frame, struct dx_hash_info *hinfo, ++ struct htree_lock *lck, int *error) + { + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned count, continued; +@@ -1291,7 +1626,14 @@ static struct ext4_dir_entry_2 *do_split + hash2, split, count-split)); + + /* Fancy dance to stay within two buffers */ +- de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize); ++ if (hinfo->hash < hash2) { ++ de2 = dx_move_dirents(data1, data2, map + split, ++ count - split, blocksize); ++ } else { ++ /* make sure we will add entry to the same block which ++ * we have already locked */ ++ de2 = dx_move_dirents(data1, data2, map, split, blocksize); ++ } + de = dx_pack_dirents(data1, blocksize); + de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de, + blocksize); +@@ -1300,13 +1642,21 @@ static struct ext4_dir_entry_2 *do_split + dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); + dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); + +- /* Which block gets the new entry? */ +- if (hinfo->hash >= hash2) +- { +- swap(*bh, bh2); +- de = de2; ++ ext4_htree_spin_lock(lck, frame > frames ? 
(frame - 1)->at : NULL, ++ frame->at); /* notify block is being split */ ++ if (hinfo->hash < hash2) { ++ dx_insert_block(frame, hash2 + continued, newblock); ++ ++ } else { ++ /* switch block number */ ++ dx_insert_block(frame, hash2 + continued, ++ dx_get_block(frame->at)); ++ dx_set_block(frame->at, newblock); ++ (frame->at)++; + } +- dx_insert_block(frame, hash2 + continued, newblock); ++ ext4_htree_spin_unlock(lck); ++ ext4_htree_dx_unlock(lck); ++ + err = ext4_handle_dirty_metadata(handle, dir, bh2); + if (err) + goto journal_error; +@@ -1418,7 +1768,7 @@ static int add_dirent_to_buf(handle_t *h + if (!IS_NOCMTIME(dir)) + dir->i_mtime = dir->i_ctime = ext4_current_time(dir); + ext4_update_dx_flag(dir); +- dir->i_version++; ++ inode_inc_iversion(dir); + ext4_mark_inode_dirty(handle, dir); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, dir, bh); +@@ -1438,7 +1788,7 @@ static int make_indexed_dir(handle_t *ha + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct buffer_head *bh2; +- struct dx_frame frames[2], *frame; ++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; + struct dx_entry *entries; + struct ext4_dir_entry_2 *de, *de2, *dot_de, *dotdot_de; + char *data1, *top; +@@ -1517,7 +1867,7 @@ static int make_indexed_dir(handle_t *ha + frame->at = entries; + frame->bh = bh; + bh = bh2; +- de = do_split(handle,dir, &bh, frame, &hinfo, &retval); ++ de = do_split(handle,dir, &bh, frames, frame, &hinfo, NULL, &retval); + dx_release (frames); + if (!(de)) + return retval; +@@ -1616,7 +1966,7 @@ out: + * the entry, as someone else might have used it while you slept. + */ + int ext4_add_entry(handle_t *handle, struct dentry *dentry, +- struct inode *inode) ++ struct inode *inode, struct htree_lock *lck) + { + struct inode *dir = dentry->d_parent->d_inode; + struct buffer_head *bh; +@@ -1635,9 +1985,10 @@ int ext4_add_entry(handle_t *handle, str + if (dentry->d_name.len == 2 && + memcmp(dentry->d_name.name, "..", 2) == 0) + return ext4_update_dotdot(handle, dentry, inode); +- retval = ext4_dx_add_entry(handle, dentry, inode); ++ retval = ext4_dx_add_entry(handle, dentry, inode, lck); + if (!retval || (retval != ERR_BAD_DX_DIR)) + return retval; ++ ext4_htree_safe_relock(lck); + ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); + dx_fallback++; + ext4_mark_inode_dirty(handle, dir); +@@ -1674,18 +2025,21 @@ EXPORT_SYMBOL(ext4_add_entry); + * Returns 0 for success, or a negative error value + */ + static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, +- struct inode *inode) ++ struct inode *inode, struct htree_lock *lck) + { +- struct dx_frame frames[2], *frame; ++ struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; + struct dx_entry *entries, *at; + struct dx_hash_info hinfo; + struct buffer_head *bh; + struct inode *dir = dentry->d_parent->d_inode; + struct super_block *sb = dir->i_sb; + struct ext4_dir_entry_2 *de; ++ int restart; + int err; + +- frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err); ++again: ++ restart = 0; ++ frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, lck, &err); + if (!frame) + return err; + entries = frame->entries; +@@ -1694,33 +2048,53 @@ static int ext4_dx_add_entry(handle_t *h + if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err))) + goto cleanup; + +- BUFFER_TRACE(bh, "get_write_access"); +- err = ext4_journal_get_write_access(handle, bh); +- if (err) +- goto journal_error; +- + err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); + 
if (err != -ENOSPC)
+ goto cleanup;
+
++ err = 0;
+ /* Block full, should compress but for now just split */
+ dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
+ dx_get_count(entries), dx_get_limit(entries)));
+ /* Need to split index? */
+ if (dx_get_count(entries) == dx_get_limit(entries)) {
+ ext4_lblk_t newblock;
+- unsigned icount = dx_get_count(entries);
+- int levels = frame - frames;
++ int levels = frame - frames + 1;
++ unsigned icount;
++ int add_level = 1;
+ struct dx_entry *entries2;
+ struct dx_node *node2;
+ struct buffer_head *bh2;
+
+- if (levels && (dx_get_count(frames->entries) ==
+- dx_get_limit(frames->entries))) {
+- ext4_warning(sb, "Directory index full!");
++ if (!ext4_htree_safe_locked(lck)) { /* retry with EX lock */
++ ext4_htree_safe_relock(lck);
++ restart = 1;
++ goto cleanup;
++ }
++ while (frame > frames) {
++ if (dx_get_count((frame - 1)->entries) <
++ dx_get_limit((frame - 1)->entries)) {
++ add_level = 0;
++ break;
++ }
++ frame--; /* split higher index block */
++ at = frame->at;
++ entries = frame->entries;
++ restart = 1;
++ }
++ if (add_level && levels == ext4_dir_htree_level(sb)) {
++ ext4_warning(sb, "Directory (ino: %lu) index full, "
++ "reached max htree level: %d",
++ dir->i_ino, levels);
++ if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
++ ext4_warning(sb, "Large directory feature is "
++ "not enabled on this "
++ "filesystem");
++ }
+ err = -ENOSPC;
+ goto cleanup;
+ }
++ icount = dx_get_count(entries);
+ bh2 = ext4_append (handle, dir, &newblock, &err);
+ if (!(bh2))
+ goto cleanup;
+@@ -1733,7 +2107,7 @@ static int ext4_dx_add_entry(handle_t *h
+ err = ext4_journal_get_write_access(handle, frame->bh);
+ if (err)
+ goto journal_error;
+- if (levels) {
++ if (!add_level) {
+ unsigned icount1 = icount/2, icount2 = icount - icount1;
+ unsigned hash2 = dx_get_hash(entries + icount1);
+ dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
+@@ -1741,7 +2115,7 @@ static int ext4_dx_add_entry(handle_t *h
+
+ BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
+ err = ext4_journal_get_write_access(handle,
+- frames[0].bh);
++ (frame - 1)->bh);
+ if (err)
+ goto journal_error;
+
+@@ -1757,18 +2131,24 @@ static int ext4_dx_add_entry(handle_t *h
+ frame->entries = entries = entries2;
+ swap(frame->bh, bh2);
+ }
+- dx_insert_block(frames + 0, hash2, newblock);
+- dxtrace(dx_show_index("node", frames[1].entries));
++ dx_insert_block((frame - 1), hash2, newblock);
++ dxtrace(dx_show_index("node", frame->entries));
+ dxtrace(dx_show_index("node",
+ ((struct dx_node *) bh2->b_data)->entries));
+ err = ext4_handle_dirty_metadata(handle, inode, bh2);
+ if (err)
+ goto journal_error;
+ brelse (bh2);
++ ext4_handle_dirty_metadata(handle, inode,
++ (frame - 1)->bh);
++ if (restart) {
++ ext4_handle_dirty_metadata(handle, inode,
++ frame->bh);
++ goto cleanup;
++ }
+ } else {
+ struct dx_root_info * info;
+- dxtrace(printk(KERN_DEBUG
+- "Creating second level index...\n"));
++
+ memcpy((char *) entries2, (char *) entries,
+ icount * sizeof(struct dx_entry));
+ dx_set_limit(entries2, dx_node_limit(dir));
+@@ -1778,32 +2158,60 @@ static int ext4_dx_add_entry(handle_t *h
+ dx_set_block(entries + 0, newblock);
+ info = dx_get_dx_info((struct ext4_dir_entry_2*)
+ frames[0].bh->b_data);
+- info->indirect_levels = 1;
++ info->indirect_levels += 1;
++ dxtrace(printk(KERN_DEBUG
++ "Creating %d level index...\n",
++ info->indirect_levels));
++ ext4_handle_dirty_metadata(handle, inode, frame->bh);
++ ext4_handle_dirty_metadata(handle, inode, bh2);
++
brelse(bh2); ++ restart = 1; ++ goto cleanup; ++ } ++ } else if (!ext4_htree_dx_locked(lck)) { ++ struct ext4_dir_lock_data *ld = ext4_htree_lock_data(lck); + +- /* Add new access path frame */ +- frame = frames + 1; +- frame->at = at = at - entries + entries2; +- frame->entries = entries = entries2; +- frame->bh = bh2; +- err = ext4_journal_get_write_access(handle, +- frame->bh); +- if (err) +- goto journal_error; ++ /* not well protected, require DX lock */ ++ ext4_htree_dx_need_lock(lck); ++ at = frame > frames ? (frame - 1)->at : NULL; ++ ++ /* NB: no risk of deadlock because it's just a try. ++ * ++ * NB: we check ld_count for twice, the first time before ++ * having DX lock, the second time after holding DX lock. ++ * ++ * NB: We never free blocks for directory so far, which ++ * means value returned by dx_get_count() should equal to ++ * ld->ld_count if nobody split any DE-block under @at, ++ * and ld->ld_at still points to valid dx_entry. */ ++ if ((ld->ld_count != dx_get_count(entries)) || ++ !ext4_htree_dx_lock_try(lck, at) || ++ (ld->ld_count != dx_get_count(entries))) { ++ restart = 1; ++ goto cleanup; + } +- ext4_handle_dirty_metadata(handle, inode, frames[0].bh); ++ /* OK, I've got DX lock and nothing changed */ ++ frame->at = ld->ld_at; + } +- de = do_split(handle, dir, &bh, frame, &hinfo, &err); ++ de = do_split(handle, dir, &bh, frames, frame, &hinfo, lck, &err); + if (!de) + goto cleanup; ++ + err = add_dirent_to_buf(handle, dentry, inode, de, bh); + goto cleanup; + + journal_error: + ext4_std_error(dir->i_sb, err); + cleanup: ++ ext4_htree_dx_unlock(lck); ++ ext4_htree_de_unlock(lck); + if (bh) + brelse(bh); + dx_release(frames); ++ /* @restart is true means htree-path has been changed, we need to ++ * repeat dx_probe() to find out valid htree-path */ ++ if (restart && err == 0) ++ goto again; + return err; + } + +@@ -1838,7 +2246,7 @@ int ext4_delete_entry(handle_t *handle, + blocksize); + else + de->inode = 0; +- dir->i_version++; ++ inode_inc_iversion(dir); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + ext4_handle_dirty_metadata(handle, dir, bh); + return 0; +@@ -1882,7 +2290,7 @@ static void ext4_dec_count(handle_t *han + static int ext4_add_nondir(handle_t *handle, + struct dentry *dentry, struct inode *inode) + { +- int err = ext4_add_entry(handle, dentry, inode); ++ int err = ext4_add_entry(handle, dentry, inode, NULL); + if (!err) { + ext4_mark_inode_dirty(handle, inode); + d_instantiate(dentry, inode); +@@ -2112,7 +2520,7 @@ retry: + goto out_stop; + } + +- err = ext4_add_entry(handle, dentry, inode); ++ err = ext4_add_entry(handle, dentry, inode, NULL); + if (err) { + clear_nlink(inode); + unlock_new_inode(inode); +@@ -2381,7 +2789,7 @@ static int ext4_rmdir(struct inode *dir, + return PTR_ERR(handle); + + retval = -ENOENT; +- bh = ext4_find_entry(dir, &dentry->d_name, &de); ++ bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); + if (!bh) + goto end_rmdir; + +@@ -2443,7 +2851,7 @@ static int ext4_unlink(struct inode *dir + ext4_handle_sync(handle); + + retval = -ENOENT; +- bh = ext4_find_entry(dir, &dentry->d_name, &de); ++ bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); + if (!bh) + goto end_unlink; + +@@ -2567,7 +2975,7 @@ retry: + ext4_inc_count(handle, inode); + atomic_inc(&inode->i_count); + +- err = ext4_add_entry(handle, dentry, inode); ++ err = ext4_add_entry(handle, dentry, inode, NULL); + if (!err) { + ext4_mark_inode_dirty(handle, inode); + d_instantiate(dentry, inode); +@@ -2612,7 +3020,7 @@ static int ext4_rename(struct inode 
*old + if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) + ext4_handle_sync(handle); + +- old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de); ++ old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de, NULL); + /* + * Check for inode number is _not_ due to possible IO errors. + * We might rmdir the source, keep it as pwd of some process +@@ -2625,7 +3033,7 @@ static int ext4_rename(struct inode *old + goto end_rename; + + new_inode = new_dentry->d_inode; +- new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de); ++ new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de, NULL); + if (new_bh) { + if (!new_inode) { + brelse(new_bh); +@@ -2651,7 +3059,7 @@ static int ext4_rename(struct inode *old + goto end_rename; + } + if (!new_bh) { +- retval = ext4_add_entry(handle, new_dentry, old_inode); ++ retval = ext4_add_entry(handle, new_dentry, old_inode, NULL); + if (retval) + goto end_rename; + } else { +@@ -2693,7 +3101,8 @@ static int ext4_rename(struct inode *old + struct buffer_head *old_bh2; + struct ext4_dir_entry_2 *old_de2; + +- old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2); ++ old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, ++ &old_de2, NULL); + if (old_bh2) { + retval = ext4_delete_entry(handle, old_dir, + old_de2, old_bh2); +--- linux-2.6.32-131.6.1/fs/ext4/inode.c 2011-10-06 20:10:49.000000000 +0800 ++++ linux-2.6.32-131.6.1-pdo/fs/ext4/inode.c 2011-12-01 22:02:11.000000000 +0800 +@@ -5112,7 +5112,7 @@ struct inode *ext4_iget(struct super_blo + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) + ei->i_file_acl |= + ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; +- inode->i_size = ext4_isize(raw_inode); ++ inode->i_size = ext4_isize(sb, raw_inode); + ei->i_disksize = inode->i_size; + #ifdef CONFIG_QUOTA + ei->i_reserved_quota = 0; +--- linux-2.6.32-131.6.1/fs/ext4/Makefile 2011-10-06 20:10:49.000000000 +0800 ++++ linux-2.6.32-131.6.1-pdo/fs/ext4/Makefile 2011-10-06 12:21:30.000000000 +0800 +@@ -7,7 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o + ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ + ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ +- mmp.o dynlocks.o ++ htree_lock.o mmp.o dynlocks.o + + ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o + ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.series index 4b4318b..8b6a5a0 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.series @@ -33,3 +33,4 @@ ext4-export-64bit-name-hash.patch ext4-vmalloc-rhel6.patch ext4-journal-callback.patch ext4-store-tree-generation-at-find.patch +ext4_pdirop-rhel6.patch diff --git a/ldiskfs/ldiskfs/Makefile.in b/ldiskfs/ldiskfs/Makefile.in index 5abd246..acbd133 100644 --- a/ldiskfs/ldiskfs/Makefile.in +++ b/ldiskfs/ldiskfs/Makefile.in @@ -7,6 +7,8 @@ backfs_extra := $(wildcard @LINUX@/fs/@BACKFS@/Makefile) backfs_headers := $(wildcard @EXT_DIR@/*.h) linux_headers := $(wildcard @LINUX@/include/linux/@BACKFS@*.h) +linux_new_headers := dynlocks.h +@LDISKFS_PDO_TRUE@linux_new_headers += htree_lock.h trace_headers := $(wildcard @LINUX@/include/trace/events/@BACKFS@*.h) backfs_sources := $(filter-out %.mod.c,$(wildcard @EXT_DIR@/*.c)) @@ -15,6 +17,7 @@ ext3_new_sources := extents.c mballoc.c group.h dynlocks.c fiemap.h ext3_new_headers 
:= ext3_extents.h ext4_new_sources := dynlocks.c fiemap.h mmp.c +@LDISKFS_PDO_TRUE@ext4_new_sources += htree_lock.c ext4_new_headers := new_sources := $(@BACKFS@_new_sources) diff --git a/ldiskfs/ldiskfs/autoMakefile.am b/ldiskfs/ldiskfs/autoMakefile.am index d6460a2..a4dda14 100644 --- a/ldiskfs/ldiskfs/autoMakefile.am +++ b/ldiskfs/ldiskfs/autoMakefile.am @@ -72,10 +72,12 @@ endif linux-stage/include/trace/events/@BACKFS@$$i \ > trace/events/ldiskfs$$i ; \ done - sed $(strip $(ldiskfs_sed_flags)) \ - linux-stage/include/linux/dynlocks.h \ - > linux/dynlocks.h - + for i in $(notdir $(linux_new_headers)) ; do \ + echo -n " $$i"; \ + sed $(strip $(ldiskfs_sed_flags)) \ + linux-stage/include/linux/$$i \ + > linux/$$i ; \ + done @echo touch sources @@ -89,7 +91,7 @@ foo-check: @echo "ldiskfs_LDADD: $(ldiskfs_LDADD)" MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ -CLEANFILES = sources $(notdir $(linux_headers) $(backfs_headers) $(backfs_sources) $(new_sources) $(new_headers) $(trace_headers)) +CLEANFILES = sources $(notdir $(linux_new_headers) $(linux_headers) $(backfs_headers) $(backfs_sources) $(new_sources) $(new_headers) $(trace_headers)) clean: clean-am rm -rf linux linux-stage ldiskfs*.h trace diff --git a/lustre/osd-ldiskfs/osd_handler.c b/lustre/osd-ldiskfs/osd_handler.c index 9e4f75b..788eb31 100644 --- a/lustre/osd-ldiskfs/osd_handler.c +++ b/lustre/osd-ldiskfs/osd_handler.c @@ -82,6 +82,14 @@ /* llo_* api support */ #include +#ifdef HAVE_LDISKFS_PDO +int ldiskfs_pdo = 1; +CFS_MODULE_PARM(ldiskfs_pdo, "i", int, 0644, + "ldiskfs with parallel directory operations"); +#else +int ldiskfs_pdo = 0; +#endif + static const char dot[] = "."; static const char dotdot[] = ".."; static const char remote_obj_dir[] = "REM_OBJ_DIR"; @@ -104,6 +112,7 @@ struct osd_object { /** * to protect index ops. */ + struct htree_lock_head *oo_hl_head; cfs_rw_semaphore_t oo_ext_idx_sem; cfs_rw_semaphore_t oo_sem; struct osd_directory *oo_dir; @@ -315,8 +324,9 @@ static struct lu_object *osd_object_alloc(const struct lu_env *env, cfs_init_rwsem(&mo->oo_ext_idx_sem); cfs_spin_lock_init(&mo->oo_guard); return l; - } else + } else { return NULL; + } } /* @@ -398,27 +408,43 @@ static int osd_fid_lookup(const struct lu_env *env, RETURN(-ENOENT); result = osd_oi_lookup(info, oi, fid, id); - if (result == 0) { - inode = osd_iget(info, dev, id); - if (!IS_ERR(inode)) { - obj->oo_inode = inode; - LASSERT(obj->oo_inode->i_sb == osd_sb(dev)); - if (dev->od_iop_mode) { - obj->oo_compat_dot_created = 1; - obj->oo_compat_dotdot_created = 1; - } + if (result != 0) { + if (result == -ENOENT) result = 0; - } else - /* - * If fid wasn't found in oi, inode-less object is - * created, for which lu_object_exists() returns - * false. This is used in a (frequent) case when - * objects are created as locking anchors or - * place holders for objects yet to be created. - */ - result = PTR_ERR(inode); - } else if (result == -ENOENT) - result = 0; + goto out; + } + + inode = osd_iget(info, dev, id); + if (IS_ERR(inode)) { + /* + * If fid wasn't found in oi, inode-less object is + * created, for which lu_object_exists() returns + * false. This is used in a (frequent) case when + * objects are created as locking anchors or + * place holders for objects yet to be created. 
+ */ + result = PTR_ERR(inode); + goto out; + } + + obj->oo_inode = inode; + LASSERT(obj->oo_inode->i_sb == osd_sb(dev)); + if (dev->od_iop_mode) { + obj->oo_compat_dot_created = 1; + obj->oo_compat_dotdot_created = 1; + } + + if (!S_ISDIR(inode->i_mode) || !ldiskfs_pdo) /* done */ + goto out; + + LASSERT(obj->oo_hl_head == NULL); + obj->oo_hl_head = ldiskfs_htree_lock_head_alloc(HTREE_HBITS_DEF); + if (obj->oo_hl_head == NULL) { + obj->oo_inode = NULL; + iput(inode); + result = -ENOMEM; + } +out: LINVRNT(osd_invariant(obj)); RETURN(result); @@ -467,6 +493,8 @@ static void osd_object_free(const struct lu_env *env, struct lu_object *l) LINVRNT(osd_invariant(obj)); dt_object_fini(&obj->oo_dt); + if (obj->oo_hl_head != NULL) + ldiskfs_htree_lock_head_free(obj->oo_hl_head); OBD_FREE_PTR(obj); } @@ -1536,6 +1564,13 @@ static int osd_mkfile(struct osd_thread_info *info, struct osd_object *obj, LINVRNT(osd_invariant(obj)); LASSERT(obj->oo_inode == NULL); + LASSERT(obj->oo_hl_head == NULL); + + if (S_ISDIR(mode) && ldiskfs_pdo) { + obj->oo_hl_head =ldiskfs_htree_lock_head_alloc(HTREE_HBITS_DEF); + if (obj->oo_hl_head == NULL) + return -ENOMEM; + } oth = container_of(th, struct osd_thandle, ot_super); LASSERT(oth->ot_handle->h_transaction != NULL); @@ -1563,8 +1598,13 @@ static int osd_mkfile(struct osd_thread_info *info, struct osd_object *obj, inode->i_flags |= S_NOCMTIME; obj->oo_inode = inode; result = 0; - } else + } else { + if (obj->oo_hl_head != NULL) { + ldiskfs_htree_lock_head_free(obj->oo_hl_head); + obj->oo_hl_head = NULL; + } result = PTR_ERR(inode); + } LINVRNT(osd_invariant(obj)); return result; } @@ -2400,10 +2440,12 @@ static int osd_index_try(const struct lu_env *env, struct dt_object *dt, else result = 0; cfs_up_write(&obj->oo_ext_idx_sem); - } else + } else { result = -ENOMEM; - } else + } + } else { result = 0; + } if (result == 0 && ea_dir == 0) { if (!osd_iam_index_probe(env, obj, feat)) @@ -2771,6 +2813,7 @@ static int osd_index_ea_delete(const struct lu_env *env, struct dt_object *dt, struct osd_thandle *oh; struct ldiskfs_dir_entry_2 *de; struct buffer_head *bh; + struct htree_lock *hlock = NULL; int rc; @@ -2790,16 +2833,27 @@ static int osd_index_ea_delete(const struct lu_env *env, struct dt_object *dt, dentry = osd_child_dentry_get(env, obj, (char *)key, strlen((char *)key)); - cfs_down_write(&obj->oo_ext_idx_sem); - bh = ll_ldiskfs_find_entry(dir, dentry, &de); + if (obj->oo_hl_head != NULL) { + hlock = osd_oti_get(env)->oti_hlock; + ldiskfs_htree_lock(hlock, obj->oo_hl_head, + dir, LDISKFS_HLOCK_DEL); + } else { + cfs_down_write(&obj->oo_ext_idx_sem); + } + + bh = osd_ldiskfs_find_entry(dir, dentry, &de, hlock); if (bh) { rc = ldiskfs_delete_entry(oh->ot_handle, - dir, de, bh); + dir, de, bh); brelse(bh); - } else + } else { rc = -ENOENT; + } + if (hlock != NULL) + ldiskfs_htree_unlock(hlock); + else + cfs_up_write(&obj->oo_ext_idx_sem); - cfs_up_write(&obj->oo_ext_idx_sem); LASSERT(osd_invariant(obj)); RETURN(rc); } @@ -2940,6 +2994,7 @@ static int __osd_ea_add_rec(struct osd_thread_info *info, struct inode *cinode, const char *name, const struct dt_rec *fid, + struct htree_lock *hlock, struct thandle *th) { struct ldiskfs_dentry_param *ldp; @@ -2960,7 +3015,7 @@ static int __osd_ea_add_rec(struct osd_thread_info *info, child->d_fsdata = (void*) ldp; } else child->d_fsdata = NULL; - rc = ldiskfs_add_entry(oth->ot_handle, child, cinode); + rc = osd_ldiskfs_add_entry(oth->ot_handle, child, cinode, hlock); RETURN(rc); } @@ -3018,11 +3073,11 @@ static int 
osd_add_dot_dotdot(struct osd_thread_info *info, /* in case of rename, dotdot is already created */ if (dir->oo_compat_dotdot_created) { return __osd_ea_add_rec(info, dir, parent_dir, name, - dot_dot_fid, th); + dot_dot_fid, NULL, th); } - result = ldiskfs_add_dot_dotdot(oth->ot_handle, parent_dir, inode, - dot_ldp, dot_dot_ldp); + result = ldiskfs_add_dot_dotdot(oth->ot_handle, parent_dir, + inode, dot_ldp, dot_dot_ldp); if (result == 0) dir->oo_compat_dotdot_created = 1; } @@ -3043,15 +3098,37 @@ static int osd_ea_add_rec(const struct lu_env *env, struct thandle *th) { struct osd_thread_info *info = osd_oti_get(env); + struct htree_lock *hlock; int rc; + hlock = pobj->oo_hl_head != NULL ? info->oti_hlock : NULL; + if (name[0] == '.' && (name[1] == '\0' || (name[1] == '.' && - name[2] =='\0'))) + name[2] =='\0'))) { + if (hlock != NULL) { + ldiskfs_htree_lock(hlock, pobj->oo_hl_head, + pobj->oo_inode, 0); + } else { + cfs_down_write(&pobj->oo_ext_idx_sem); + } rc = osd_add_dot_dotdot(info, pobj, cinode, name, (struct dt_rec *)lu_object_fid(&pobj->oo_dt.do_lu), fid, th); + } else { + if (hlock != NULL) { + ldiskfs_htree_lock(hlock, pobj->oo_hl_head, + pobj->oo_inode, LDISKFS_HLOCK_ADD); + } else { + cfs_down_write(&pobj->oo_ext_idx_sem); + } + + rc = __osd_ea_add_rec(info, pobj, cinode, name, fid, + hlock, th); + } + if (hlock != NULL) + ldiskfs_htree_unlock(hlock); else - rc = __osd_ea_add_rec(info, pobj, cinode, name, fid, th); + cfs_up_write(&pobj->oo_ext_idx_sem); return rc; } @@ -3072,6 +3149,7 @@ static int osd_ea_lookup_rec(const struct lu_env *env, struct osd_object *obj, struct ldiskfs_dir_entry_2 *de; struct buffer_head *bh; struct lu_fid *fid = (struct lu_fid *) rec; + struct htree_lock *hlock = NULL; int ino; int rc; @@ -3080,8 +3158,15 @@ static int osd_ea_lookup_rec(const struct lu_env *env, struct osd_object *obj, dentry = osd_child_dentry_get(env, obj, (char *)key, strlen((char *)key)); - cfs_down_read(&obj->oo_ext_idx_sem); - bh = ll_ldiskfs_find_entry(dir, dentry, &de); + if (obj->oo_hl_head != NULL) { + hlock = osd_oti_get(env)->oti_hlock; + ldiskfs_htree_lock(hlock, obj->oo_hl_head, + dir, LDISKFS_HLOCK_LOOKUP); + } else { + cfs_down_read(&obj->oo_ext_idx_sem); + } + + bh = osd_ldiskfs_find_entry(dir, dentry, &de, hlock); if (bh) { ino = le32_to_cpu(de->inode); rc = osd_get_fid_from_dentry(de, rec); @@ -3090,10 +3175,14 @@ static int osd_ea_lookup_rec(const struct lu_env *env, struct osd_object *obj, brelse(bh); if (rc != 0) rc = osd_ea_fid_get(env, obj, ino, fid); - } else + } else { rc = -ENOENT; + } - cfs_up_read(&obj->oo_ext_idx_sem); + if (hlock != NULL) + ldiskfs_htree_unlock(hlock); + else + cfs_up_read(&obj->oo_ext_idx_sem); RETURN (rc); } @@ -3195,9 +3284,7 @@ static int osd_index_ea_insert(const struct lu_env *env, struct dt_object *dt, else cfs_cap_lower(CFS_CAP_SYS_RESOURCE); #endif - cfs_down_write(&obj->oo_ext_idx_sem); rc = osd_ea_add_rec(env, obj, child->oo_inode, name, rec, th); - cfs_up_write(&obj->oo_ext_idx_sem); #ifdef HAVE_QUOTA_SUPPORT cfs_curproc_cap_unpack(save); #endif @@ -3616,22 +3703,34 @@ static int osd_ldiskfs_filldir(char *buf, const char *name, int namelen, * \retval 0 on success * \retval -ve on error */ -static int osd_ldiskfs_it_fill(const struct dt_it *di) +static int osd_ldiskfs_it_fill(const struct lu_env *env, + const struct dt_it *di) { struct osd_it_ea *it = (struct osd_it_ea *)di; struct osd_object *obj = it->oie_obj; struct inode *inode = obj->oo_inode; - int result = 0; + struct htree_lock *hlock = NULL; + int result = 0; 
ENTRY; it->oie_dirent = it->oie_buf; it->oie_rd_dirent = 0; - cfs_down_read(&obj->oo_ext_idx_sem); + if (obj->oo_hl_head != NULL) { + hlock = osd_oti_get(env)->oti_hlock; + ldiskfs_htree_lock(hlock, obj->oo_hl_head, + inode, LDISKFS_HLOCK_READDIR); + } else { + cfs_down_read(&obj->oo_ext_idx_sem); + } + result = inode->i_fop->readdir(&it->oie_file, it, (filldir_t) osd_ldiskfs_filldir); - cfs_up_read(&obj->oo_ext_idx_sem); + if (hlock != NULL) + ldiskfs_htree_unlock(hlock); + else + cfs_up_read(&obj->oo_ext_idx_sem); if (it->oie_rd_dirent == 0) { result = -EIO; @@ -3672,7 +3771,7 @@ static int osd_it_ea_next(const struct lu_env *env, struct dt_it *di) if (it->oie_file.f_pos == LDISKFS_HTREE_EOF) rc = +1; else - rc = osd_ldiskfs_it_fill(di); + rc = osd_ldiskfs_it_fill(env, di); } RETURN(rc); @@ -3777,7 +3876,7 @@ static int osd_it_ea_load(const struct lu_env *env, ENTRY; it->oie_file.f_pos = hash; - rc = osd_ldiskfs_it_fill(di); + rc = osd_ldiskfs_it_fill(env, di); if (rc == 0) rc = +1; @@ -3842,19 +3941,26 @@ static void *osd_key_init(const struct lu_context *ctx, struct osd_thread_info *info; OBD_ALLOC_PTR(info); - if (info != NULL) { - OBD_ALLOC(info->oti_it_ea_buf, OSD_IT_EA_BUFSIZE); - if (info->oti_it_ea_buf != NULL) { - info->oti_env = container_of(ctx, struct lu_env, - le_ctx); - } else { - OBD_FREE_PTR(info); - info = ERR_PTR(-ENOMEM); - } - } else { - info = ERR_PTR(-ENOMEM); - } + if (info == NULL) + return ERR_PTR(-ENOMEM); + + OBD_ALLOC(info->oti_it_ea_buf, OSD_IT_EA_BUFSIZE); + if (info->oti_it_ea_buf == NULL) + goto out_free_info; + + info->oti_env = container_of(ctx, struct lu_env, le_ctx); + + info->oti_hlock = ldiskfs_htree_lock_alloc(); + if (info->oti_hlock == NULL) + goto out_free_ea; + return info; + + out_free_ea: + OBD_FREE(info->oti_it_ea_buf, OSD_IT_EA_BUFSIZE); + out_free_info: + OBD_FREE_PTR(info); + return ERR_PTR(-ENOMEM); } static void osd_key_fini(const struct lu_context *ctx, @@ -3862,6 +3968,8 @@ static void osd_key_fini(const struct lu_context *ctx, { struct osd_thread_info *info = data; + if (info->oti_hlock != NULL) + ldiskfs_htree_lock_free(info->oti_hlock); OBD_FREE(info->oti_it_ea_buf, OSD_IT_EA_BUFSIZE); OBD_FREE_PTR(info); } diff --git a/lustre/osd-ldiskfs/osd_internal.h b/lustre/osd-ldiskfs/osd_internal.h index 5b3eaa4..662f7a1 100644 --- a/lustre/osd-ldiskfs/osd_internal.h +++ b/lustre/osd-ldiskfs/osd_internal.h @@ -98,6 +98,55 @@ struct osd_ctxt { }; #endif +#ifdef HAVE_LDISKFS_PDO + +#define osd_ldiskfs_find_entry(dir, dentry, de, lock) \ + ll_ldiskfs_find_entry(dir, dentry, de, lock) +#define osd_ldiskfs_add_entry(handle, child, cinode, hlock) \ + ldiskfs_add_entry(handle, child, cinode, hlock) + +#else /* HAVE_LDISKFS_PDO */ + +struct htree_lock { + int dummy; +}; + +struct htree_lock_head { + int dummy; +}; + +#define ldiskfs_htree_lock(lock, head, inode, op) do { LBUG(); } while (0) +#define ldiskfs_htree_unlock(lock) do { LBUG(); } while (0) + +static inline struct htree_lock_head *ldiskfs_htree_lock_head_alloc(int dep) +{ + LBUG(); + return NULL; +} + +#define ldiskfs_htree_lock_head_free(lh) do { LBUG(); } while (0) + +#define LDISKFS_DUMMY_HTREE_LOCK 0xbabecafe + +static inline struct htree_lock *ldiskfs_htree_lock_alloc(void) +{ + return (struct htree_lock *)LDISKFS_DUMMY_HTREE_LOCK; +} + +static inline void ldiskfs_htree_lock_free(struct htree_lock *lk) +{ + LASSERT((unsigned long)lk == LDISKFS_DUMMY_HTREE_LOCK); +} + +#define HTREE_HBITS_DEF 0 + +#define osd_ldiskfs_find_entry(dir, dentry, de, lock) \ + ll_ldiskfs_find_entry(dir, 
dentry, de) +#define osd_ldiskfs_add_entry(handle, child, cinode, lock) \ + ldiskfs_add_entry(handle, child, cinode) + +#endif /* HAVE_LDISKFS_PDO */ + /* * osd device. */ @@ -219,6 +268,7 @@ struct osd_thread_info { /** dentry for Iterator context. */ struct dentry oti_it_dentry; + struct htree_lock *oti_hlock; struct lu_fid oti_fid; struct osd_inode_id oti_id; @@ -289,6 +339,8 @@ struct osd_thread_info { char oti_ldp2[OSD_FID_REC_SZ]; }; +extern int ldiskfs_pdo; + #ifdef LPROCFS /* osd_lproc.c */ void lprocfs_osd_init_vars(struct lprocfs_static_vars *lvars); diff --git a/lustre/osd-ldiskfs/osd_lproc.c b/lustre/osd-ldiskfs/osd_lproc.c index a8862e6..6f1bbac 100644 --- a/lustre/osd-ldiskfs/osd_lproc.c +++ b/lustre/osd-ldiskfs/osd_lproc.c @@ -263,6 +263,31 @@ static int lprocfs_osd_rd_mntdev(char *page, char **start, off_t off, int count, osd->od_mount->lmi_mnt->mnt_devname); } +#ifdef HAVE_LDISKFS_PDO +static int lprocfs_osd_rd_pdo(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + *eof = 1; + + return snprintf(page, count, "%s\n", ldiskfs_pdo ? "ON" : "OFF"); +} + +static int lprocfs_osd_wr_pdo(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + int pdo; + int rc; + + rc = lprocfs_write_helper(buffer, count, &pdo); + if (rc != 0) + return rc; + + ldiskfs_pdo = !!pdo; + + return count; +} +#endif + struct lprocfs_vars lprocfs_osd_obd_vars[] = { { "blocksize", lprocfs_osd_rd_blksize, 0, 0 }, { "kbytestotal", lprocfs_osd_rd_kbytestotal, 0, 0 }, @@ -272,6 +297,9 @@ struct lprocfs_vars lprocfs_osd_obd_vars[] = { { "filesfree", lprocfs_osd_rd_filesfree, 0, 0 }, { "fstype", lprocfs_osd_rd_fstype, 0, 0 }, { "mntdev", lprocfs_osd_rd_mntdev, 0, 0 }, +#ifdef HAVE_LDISKFS_PDO + { "pdo", lprocfs_osd_rd_pdo, lprocfs_osd_wr_pdo, 0 }, +#endif { 0 } };
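
A note on the osd usage pattern above: all of the converted entry points share one lifecycle. The directory object owns a lock head from ldiskfs_htree_lock_head_alloc(HTREE_HBITS_DEF) (allocated in osd_fid_lookup()/osd_mkfile(), freed in osd_object_free()), each service thread reuses one htree_lock from ldiskfs_htree_lock_alloc() (cached as oti_hlock in osd_thread_info), and every directory operation brackets the ldiskfs call with ldiskfs_htree_lock()/ldiskfs_htree_unlock(), picking a mode that matches the operation: LDISKFS_HLOCK_LOOKUP, LDISKFS_HLOCK_ADD, LDISKFS_HLOCK_DEL or LDISKFS_HLOCK_READDIR. A minimal sketch of that lifecycle follows; pdo_lookup_example() is an invented name, error paths are trimmed, and the code assumes the patched ldiskfs headers:

/* Illustrative only -- not part of this patch.  Shows the
 * alloc/lock/operate/unlock/free lifecycle that osd_ea_lookup_rec()
 * and friends implement above. */
static int pdo_lookup_example(struct inode *dir,
                              struct htree_lock_head *head,
                              struct dentry *dentry)
{
        struct ldiskfs_dir_entry_2 *de;
        struct buffer_head *bh;
        struct htree_lock *hlock;
        int rc = 0;

        hlock = ldiskfs_htree_lock_alloc();     /* per thread, reusable */
        if (hlock == NULL)
                return -ENOMEM;

        /* shared protection is enough for a lookup */
        ldiskfs_htree_lock(hlock, head, dir, LDISKFS_HLOCK_LOOKUP);

        bh = osd_ldiskfs_find_entry(dir, dentry, &de, hlock);
        if (bh != NULL)
                brelse(bh);
        else
                rc = -ENOENT;

        ldiskfs_htree_unlock(hlock);
        ldiskfs_htree_lock_free(hlock);
        return rc;
}

In the patch itself the alloc/free halves live in osd_key_init()/osd_key_fini() and at object setup/teardown, so the hot path is only the lock/unlock pair.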
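
The restart logic in ext4_dx_add_entry() above is optimistic concurrency: probe the htree under a weak lock, upgrade only when a split is required, re-validate, and redo the whole probe from "again:" if anything moved in between; that is what the double ld_count check around ext4_htree_dx_lock_try() and the "if (restart && err == 0) goto again;" tail implement. The same shape, reduced to a self-contained user-space analog (compile with cc -pthread); every name here is invented, and a plain mutex stands in for the DX lock:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t dx_lock = PTHREAD_MUTEX_INITIALIZER;
static int dx_count;                    /* bumped by every simulated split */

static void add_entry(void)
{
        int seen;
again:
        seen = dx_count;                /* sampled without holding dx_lock */

        if (pthread_mutex_trylock(&dx_lock) != 0)
                goto again;             /* contended: restart, as restart = 1 does */

        if (seen != dx_count) {         /* a split won the race */
                pthread_mutex_unlock(&dx_lock);
                goto again;             /* drop everything and re-probe */
        }

        /* the cached htree path is still valid; insert here */
        pthread_mutex_unlock(&dx_lock);
}

int main(void)
{
        add_entry();
        puts("ok");
        return 0;
}

Checking the counter both before and after taking the lock is the point: the first check skips a doomed attempt cheaply, the second closes the window between sampling and locking.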
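
Finally, note the dual-path shape every modified osd entry point shares: when the object has no oo_hl_head (non-directories, or ldiskfs_pdo disabled when the object was set up) the code falls back to the old whole-directory rw-semaphore. Distilled into one hypothetical helper; dir_modify_example() is not in the patch, and it assumes the patched osd headers:

/* Sketch of the dual-path locking used by osd_index_ea_delete(),
 * osd_ea_add_rec() and osd_ldiskfs_it_fill() above.  The caller passes
 * its per-thread hlock; it is only used when the head exists. */
static void dir_modify_example(struct osd_object *obj, struct inode *dir,
                               struct htree_lock *hlock)
{
        if (obj->oo_hl_head != NULL) {
                /* PDO path: block-granular lock, writers in parallel */
                ldiskfs_htree_lock(hlock, obj->oo_hl_head, dir,
                                   LDISKFS_HLOCK_ADD);
        } else {
                /* legacy path: one rwsem serializes the directory */
                hlock = NULL;
                cfs_down_write(&obj->oo_ext_idx_sem);
        }

        /* ... add or delete the entry here ... */

        if (hlock != NULL)
                ldiskfs_htree_unlock(hlock);
        else
                cfs_up_write(&obj->oo_ext_idx_sem);
}

This is also why osd_index_ea_insert() no longer takes oo_ext_idx_sem itself: the choice between htree lock and semaphore moved into osd_ea_add_rec(), next to the code it protects.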