Whamcloud - gitweb
LU-1512 oi: reuse idle OI blocks
authorFan Yong <yong.fan@whamcloud.com>
Thu, 23 Aug 2012 15:41:51 +0000 (23:41 +0800)
committerOleg Drokin <green@whamcloud.com>
Fri, 31 Aug 2012 16:07:06 +0000 (12:07 -0400)
With the system running, some OI blocks may become empty because
related files have been removed. Originally, these empty blocks
almost cannot be reused because it always allocates new block for
new leaf or index node. Such space waste is an important reason
for the OI file size increasing quickly.

Now, when the non-released OI leaf node becomes empty, it will be
recycled into a per-OI based idle blocks pool, which is organized
as a series of tables. For each table, the on-disk format is:

+---------+---------+---------+---------+------+---------+-------+
|  magic  |  count  |  next   |  logic  |      |  logic  | free  |
|(16 bits)|(16 bits)|  table  |  blk #  | .... |  blk #  | space |
|         |         |(32 bits)|(32 bits)|      |(32 bits)|       |
+---------+---------+---------+---------+------+---------+-------+

The logic blk# for the first table is stored as "idle_blocks" in
the OI root node. In fact, there is a padding field in the OI root
node after the "dx_countlimit". We will append the "idle_blocks"
just after the "dx_countlimit" to reuse part of the padding. So
it will not cause compatibility issues with old OI files.

Adjust the strategy for OI file block allocation: reuse empty block
in its idle blocks pool with priority; if such pool is empty, then
allocate new block from system volume.

Signed-off-by: Fan Yong <yong.fan@whamcloud.com>
Change-Id: I1bccc1cd9a4e1a16e4f51ac18e6b995ec85a0cf8
Reviewed-on: http://review.whamcloud.com/3153
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: wangdi <di.wang@whamcloud.com>
Tested-by: Hudson
Reviewed-by: Liang Zhen <liang@whamcloud.com>
Tested-by: Maloo <whamcloud.maloo@gmail.com>
lustre/fid/fid_request.c
lustre/include/obd_support.h
lustre/osd-ldiskfs/osd_handler.c
lustre/osd-ldiskfs/osd_iam.c
lustre/osd-ldiskfs/osd_iam.h
lustre/osd-ldiskfs/osd_iam_lfix.c
lustre/osd-ldiskfs/osd_iam_lvar.c
lustre/tests/sanity.sh

index 6a177b4..bc0f46b 100644 (file)
@@ -309,6 +309,9 @@ int seq_client_alloc_fid(const struct lu_env *env,
         cfs_waitlink_init(&link);
         cfs_mutex_lock(&seq->lcs_mutex);
 
+       if (OBD_FAIL_CHECK(OBD_FAIL_SEQ_EXHAUST))
+               seq->lcs_fid.f_oid = seq->lcs_width;
+
         while (1) {
                 seqno_t seqnr;
 
index ccca901..f87ecab 100644 (file)
@@ -412,6 +412,7 @@ int obd_alloc_fail(const void *ptr, const char *name, const char *type,
 
 #define OBD_FAIL_SEQ                     0x1000
 #define OBD_FAIL_SEQ_QUERY_NET           0x1001
+#define OBD_FAIL_SEQ_EXHAUST            0x1002
 
 #define OBD_FAIL_FLD                     0x1100
 #define OBD_FAIL_FLD_QUERY_NET           0x1101
index 54e631e..81c00e6 100644 (file)
@@ -2688,17 +2688,6 @@ static int osd_iam_container_init(const struct lu_env *env,
                 return result;
 
         result = iam_container_setup(bag);
-        if (result != 0)
-                goto out;
-
-        if (osd_obj2dev(obj)->od_iop_mode) {
-                u32 ptr = bag->ic_descr->id_ops->id_root_ptr(bag);
-
-                bag->ic_root_bh = ldiskfs_bread(NULL, obj->oo_inode,
-                                                ptr, 0, &result);
-        }
-
- out:
         if (result == 0)
                 obj->oo_dt.do_index_ops = &osd_index_iam_ops;
         else
index 802f4f9..2d3e80e 100644 (file)
  * Format of a leaf node is not specified. Generic iam code accesses leaf
  * nodes through ->id_leaf methods in struct iam_descr.
  *
+ * The IAM root block is a special node, which contains the IAM descriptor.
+ * It is on disk format:
+ *
+ * +---------+-------+--------+---------+-------+------+-------+------------+
+ * |IAM desc | count |  idle  |         |       |      |       |            |
+ * |(fix/var)|   /   | blocks | padding | entry | .... | entry | free space |
+ * |         | limit |        |         |       |      |       |            |
+ * +---------+-------+--------+---------+-------+------+-------+------------+
+ *
+ * The padding length is calculated with the parameters in the IAM descriptor.
+ *
+ * The field "idle_blocks" is used to record empty leaf nodes, which have not
+ * been released but whose contained entries have all been removed. Ideally,
+ * the idle blocks in the IAM would be reused when new leaf nodes need to be
+ * allocated for new entries, but that depends on the IAM hash functions
+ * mapping the new entries to these idle blocks. Unfortunately, it is not easy
+ * to design hash functions for such clever mapping, especially considering
+ * the insert/lookup performance.
+ *
+ * So the IAM recycles the empty leaf nodes, and puts them into a per-file
+ * based idle blocks pool. When a new leaf node is needed, it will first try
+ * to take an idle block from such pool, regardless of how the IAM hash
+ * functions would map the entry.
+ *
+ * The idle blocks pool is organized as a series of tables, and each table
+ * can be described as following (on-disk format):
+ *
+ * +---------+---------+---------+---------+------+---------+-------+
+ * |  magic  |  count  |  next   |  logic  |      |  logic  | free  |
+ * |(16 bits)|(16 bits)|  table  |  blk #  | .... |  blk #  | space |
+ * |         |         |(32 bits)|(32 bits)|      |(32 bits)|       |
+ * +---------+---------+---------+---------+------+---------+-------+
+ *
+ * The logic blk# for the first table is stored in the root node "idle_blocks".
+ *
  */
 
 #include <linux/module.h>
@@ -130,6 +165,40 @@ void iam_format_register(struct iam_format *fmt)
 }
 EXPORT_SYMBOL(iam_format_register);
 
+static struct buffer_head *
+iam_load_idle_blocks(struct iam_container *c, iam_ptr_t blk)
+{
+       struct inode *inode = c->ic_object;
+       struct iam_idle_head *head;
+       struct buffer_head *bh;
+       int err;
+
+       LASSERT_SEM_LOCKED(&c->ic_idle_sem);
+
+       if (blk == 0)
+               return NULL;
+
+       bh = ldiskfs_bread(NULL, inode, blk, 0, &err);
+       if (bh == NULL) {
+               CERROR("%.16s: cannot load idle blocks, blk = %u, err = %d\n",
+                      LDISKFS_SB(inode->i_sb)->s_es->s_volume_name, blk, err);
+               c->ic_idle_failed = 1;
+               return ERR_PTR(err);
+       }
+
+       head = (struct iam_idle_head *)(bh->b_data);
+       if (le16_to_cpu(head->iih_magic) != IAM_IDLE_HEADER_MAGIC) {
+               CERROR("%.16s: invalid idle block head, blk = %u, magic = %d\n",
+                      LDISKFS_SB(inode->i_sb)->s_es->s_volume_name, blk,
+                      le16_to_cpu(head->iih_magic));
+               brelse(bh);
+               c->ic_idle_failed = 1;
+               return ERR_PTR(-EBADF);
+       }
+
+       return bh;
+}
+
 /*
  * Determine format of given container. This is done by scanning list of
  * registered formats and calling ->if_guess() method of each in turn.
@@ -158,21 +227,41 @@ static int iam_format_guess(struct iam_container *c)
                 if (result == 0)
                         break;
         }
-        return result;
+
+       if (result == 0) {
+               struct buffer_head *bh;
+               __u32 *idle_blocks;
+
+               LASSERT(c->ic_root_bh != NULL);
+
+               idle_blocks = (__u32 *)(c->ic_root_bh->b_data +
+                                       c->ic_descr->id_root_gap +
+                                       sizeof(struct dx_countlimit));
+               cfs_down(&c->ic_idle_sem);
+               bh = iam_load_idle_blocks(c, le32_to_cpu(*idle_blocks));
+               if (bh != NULL && IS_ERR(bh))
+                       result = PTR_ERR(bh);
+               else
+                       c->ic_idle_bh = bh;
+               cfs_up(&c->ic_idle_sem);
+       }
+
+       return result;
 }
 
 /*
  * Initialize container @c.
  */
 int iam_container_init(struct iam_container *c,
-                       struct iam_descr *descr, struct inode *inode)
+                      struct iam_descr *descr, struct inode *inode)
 {
-        memset(c, 0, sizeof *c);
-        c->ic_descr  = descr;
-        c->ic_object = inode;
-        cfs_init_rwsem(&c->ic_sem);
+       memset(c, 0, sizeof *c);
+       c->ic_descr  = descr;
+       c->ic_object = inode;
+       cfs_init_rwsem(&c->ic_sem);
        dynlock_init(&c->ic_tree_lock);
-        return 0;
+       cfs_sema_init(&c->ic_idle_sem, 1);
+       return 0;
 }
 EXPORT_SYMBOL(iam_container_init);
 
@@ -190,8 +279,10 @@ EXPORT_SYMBOL(iam_container_setup);
  */
 void iam_container_fini(struct iam_container *c)
 {
-        brelse(c->ic_root_bh);
-        c->ic_root_bh = NULL;
+       brelse(c->ic_idle_bh);
+       c->ic_idle_bh = NULL;
+       brelse(c->ic_root_bh);
+       c->ic_root_bh = NULL;
 }
 EXPORT_SYMBOL(iam_container_fini);
 
@@ -456,6 +547,11 @@ void iam_leaf_split(struct iam_leaf *l, struct buffer_head **bh, iam_ptr_t nr)
         iam_leaf_ops(l)->split(l, bh, nr);
 }
 
+static inline int iam_leaf_empty(struct iam_leaf *l)
+{
+       return iam_leaf_ops(l)->leaf_empty(l);
+}
+
 int iam_leaf_can_add(const struct iam_leaf *l,
                      const struct iam_key *k, const struct iam_rec *r)
 {
@@ -1225,7 +1321,10 @@ static int iam_htree_advance(struct inode *dir, __u32 hash,
         while (1) {
                 do_corr(schedule());
                 iam_lock_bh(p->bh);
-                p->at = iam_entry_shift(path, p->at, +1);
+               if (p->at_shifted)
+                       p->at_shifted = 0;
+               else
+                       p->at = iam_entry_shift(path, p->at, +1);
                 if (p->at < iam_entry_shift(path, p->entries,
                                             dx_get_count(p->entries))) {
                         p->leaf = dx_get_block(path, p->at);
@@ -1558,11 +1657,104 @@ int iam_it_key_size(const struct iam_iterator *it)
 }
 EXPORT_SYMBOL(iam_it_key_size);
 
+struct buffer_head *
+iam_new_node(handle_t *h, struct iam_container *c, iam_ptr_t *b, int *e)
+{
+       struct inode *inode = c->ic_object;
+       struct buffer_head *bh = NULL;
+       struct iam_idle_head *head;
+       struct buffer_head *idle;
+       __u32 *idle_blocks;
+       __u16 count;
+
+       if (c->ic_idle_bh == NULL)
+               goto newblock;
+
+       cfs_down(&c->ic_idle_sem);
+       if (unlikely(c->ic_idle_failed || c->ic_idle_bh == NULL)) {
+               cfs_up(&c->ic_idle_sem);
+               goto newblock;
+       }
+
+       head = (struct iam_idle_head *)(c->ic_idle_bh->b_data);
+       count = le16_to_cpu(head->iih_count);
+       if (count > 0) {
+               *e = ldiskfs_journal_get_write_access(h, c->ic_idle_bh);
+               if (*e != 0)
+                       goto fail;
+
+               --count;
+               *b = le32_to_cpu(head->iih_blks[count]);
+               head->iih_count = cpu_to_le16(count);
+               *e = ldiskfs_journal_dirty_metadata(h, c->ic_idle_bh);
+               if (*e != 0)
+                       goto fail;
+
+               cfs_up(&c->ic_idle_sem);
+               bh = ldiskfs_bread(NULL, inode, *b, 0, e);
+               if (bh == NULL)
+                       return NULL;
+               goto got;
+       }
+
+       /* The block itself which contains the iam_idle_head is
+        * also an idle block, and can be used as the new node. */
+       idle_blocks = (__u32 *)(c->ic_root_bh->b_data +
+                               c->ic_descr->id_root_gap +
+                               sizeof(struct dx_countlimit));
+       *e = ldiskfs_journal_get_write_access(h, c->ic_root_bh);
+       if (*e != 0)
+               goto fail;
+
+       *b = le32_to_cpu(*idle_blocks);
+       iam_lock_bh(c->ic_root_bh);
+       *idle_blocks = head->iih_next;
+       iam_unlock_bh(c->ic_root_bh);
+       *e = ldiskfs_journal_dirty_metadata(h, c->ic_root_bh);
+       if (*e != 0) {
+               iam_lock_bh(c->ic_root_bh);
+               *idle_blocks = cpu_to_le32(*b);
+               iam_unlock_bh(c->ic_root_bh);
+               goto fail;
+       }
+
+       bh = c->ic_idle_bh;
+       idle = iam_load_idle_blocks(c, le32_to_cpu(*idle_blocks));
+       if (idle != NULL && IS_ERR(idle)) {
+               *e = PTR_ERR(idle);
+               c->ic_idle_bh = NULL;
+               brelse(bh);
+               goto fail;
+       }
+
+       c->ic_idle_bh = idle;
+       cfs_up(&c->ic_idle_sem);
+
+got:
+       /* get write access for the found buffer head */
+       *e = ldiskfs_journal_get_write_access(h, bh);
+       if (*e != 0) {
+               brelse(bh);
+               bh = NULL;
+               ldiskfs_std_error(inode->i_sb, *e);
+       }
+       return bh;
+
+newblock:
+       bh = ldiskfs_append(h, inode, b, e);
+       return bh;
+
+fail:
+       cfs_up(&c->ic_idle_sem);
+       ldiskfs_std_error(inode->i_sb, *e);
+       return NULL;
+}
+
 /*
  * Insertion of new record. Interaction with jbd during non-trivial case (when
  * split happens) is as following:
  *
- *  - new leaf node is involved into transaction by ldiskfs_append();
+ *  - new leaf node is involved into transaction by iam_new_node();
  *
  *  - old leaf node is involved into transaction by iam_add_rec();
  *
@@ -1600,7 +1792,7 @@ static int iam_new_leaf(handle_t *handle, struct iam_leaf *leaf)
         path = leaf->il_path;
 
         obj = c->ic_object;
-        new_leaf = ldiskfs_append(handle, obj, (__u32 *)&blknr, &err);
+       new_leaf = iam_new_node(handle, c, &blknr, &err);
         do_corr(schedule());
         if (new_leaf != NULL) {
                 struct dynlock_handle *lh;
@@ -1791,7 +1983,8 @@ int split_index_node(handle_t *handle, struct iam_path *path,
         /* Go back down, allocating blocks, locking them, and adding into
          * transaction... */
         for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) {
-                bh_new[i] = ldiskfs_append (handle, dir, &newblock[i], &err);
+               bh_new[i] = iam_new_node(handle, path->ip_container,
+                                        &newblock[i], &err);
                 do_corr(schedule());
                 if (!bh_new[i] ||
                     descr->id_ops->id_node_init(path->ip_container,
@@ -2051,6 +2244,165 @@ int iam_it_rec_insert(handle_t *h, struct iam_iterator *it,
 }
 EXPORT_SYMBOL(iam_it_rec_insert);
 
+static inline int iam_idle_blocks_limit(struct inode *inode)
+{
+       return (inode->i_sb->s_blocksize - sizeof(struct iam_idle_head)) >> 2;
+}
+
+/*
+ * If the leaf cannot be recycled, we will lose one block for reusing.
+ * It is not a serious issue because it is almost the same as non-recycle.
+ */
+static iam_ptr_t iam_index_shrink(handle_t *h, struct iam_path *p,
+                                 struct iam_leaf *l, struct buffer_head **bh)
+{
+       struct iam_container *c = p->ip_container;
+       struct inode *inode = c->ic_object;
+       struct iam_frame *frame = p->ip_frame;
+       struct iam_entry *entries;
+       struct dynlock_handle *lh;
+       int count;
+       int rc;
+
+       if (c->ic_idle_failed)
+               return 0;
+
+       if (unlikely(frame == NULL))
+               return 0;
+
+       if (!iam_leaf_empty(l))
+               return 0;
+
+       lh = iam_lock_htree(c, frame->curidx, DLT_WRITE);
+       if (lh == NULL) {
+               CWARN("%.16s: No memory to recycle idle blocks\n",
+                     LDISKFS_SB(inode->i_sb)->s_es->s_volume_name);
+               return 0;
+       }
+
+       rc = iam_txn_add(h, p, frame->bh);
+       if (rc != 0) {
+               iam_unlock_htree(c, lh);
+               return 0;
+       }
+
+       iam_lock_bh(frame->bh);
+       entries = frame->entries;
+       count = dx_get_count(entries);
+       if (frame->at < iam_entry_shift(p, entries, count - 1)) {
+               struct iam_entry *n = iam_entry_shift(p, frame->at, 1);
+
+               memmove(frame->at, n,
+                       (char *)iam_entry_shift(p, entries, count) - (char *)n);
+               frame->at_shifted = 1;
+       }
+       dx_set_count(entries, count - 1);
+       rc = iam_txn_dirty(h, p, frame->bh);
+       iam_unlock_bh(frame->bh);
+       iam_unlock_htree(c, lh);
+       if (rc != 0)
+               return 0;
+
+       get_bh(l->il_bh);
+       *bh = l->il_bh;
+       return frame->leaf;
+}
+
+static int
+iam_install_idle_blocks(handle_t *h, struct iam_path *p, struct buffer_head *bh,
+                       __u32 *idle_blocks, iam_ptr_t blk)
+{
+       struct iam_container *c = p->ip_container;
+       struct buffer_head *old = c->ic_idle_bh;
+       struct iam_idle_head *head;
+       int rc;
+
+       rc = iam_txn_add(h, p, bh);
+       if (rc != 0)
+               return rc;
+
+       head = (struct iam_idle_head *)(bh->b_data);
+       head->iih_magic = cpu_to_le16(IAM_IDLE_HEADER_MAGIC);
+       head->iih_count = 0;
+       head->iih_next = *idle_blocks;
+       rc = iam_txn_dirty(h, p, bh);
+       if (rc != 0)
+               return rc;
+
+       rc = iam_txn_add(h, p, c->ic_root_bh);
+       if (rc != 0)
+               return rc;
+
+       iam_lock_bh(c->ic_root_bh);
+       *idle_blocks = cpu_to_le32(blk);
+       iam_unlock_bh(c->ic_root_bh);
+       rc = iam_txn_dirty(h, p, c->ic_root_bh);
+       if (rc == 0) {
+               /* NOT release old before new assigned. */
+               get_bh(bh);
+               c->ic_idle_bh = bh;
+               brelse(old);
+       } else {
+               iam_lock_bh(c->ic_root_bh);
+               *idle_blocks = head->iih_next;
+               iam_unlock_bh(c->ic_root_bh);
+       }
+       return rc;
+}
+
+/*
+ * If the leaf cannot be recycled, we will lose one block for reusing.
+ * It is not a serious issue because it is almost the same as non-recycle.
+ */
+static void iam_recycle_leaf(handle_t *h, struct iam_path *p,
+                            struct buffer_head *bh, iam_ptr_t blk)
+{
+       struct iam_container *c = p->ip_container;
+       struct inode *inode = c->ic_object;
+       struct iam_idle_head *head;
+       __u32 *idle_blocks;
+       int count;
+       int rc;
+
+       cfs_down(&c->ic_idle_sem);
+       if (unlikely(c->ic_idle_failed)) {
+               rc = -EFAULT;
+               goto unlock;
+       }
+
+       idle_blocks = (__u32 *)(c->ic_root_bh->b_data +
+                               c->ic_descr->id_root_gap +
+                               sizeof(struct dx_countlimit));
+       /* It is the first idle block. */
+       if (c->ic_idle_bh == NULL) {
+               rc = iam_install_idle_blocks(h, p, bh, idle_blocks, blk);
+               goto unlock;
+       }
+
+       head = (struct iam_idle_head *)(c->ic_idle_bh->b_data);
+       count = le16_to_cpu(head->iih_count);
+       /* Current ic_idle_bh is full, to be replaced by the leaf. */
+       if (count == iam_idle_blocks_limit(inode)) {
+               rc = iam_install_idle_blocks(h, p, bh, idle_blocks, blk);
+               goto unlock;
+       }
+
+       /* Just add to ic_idle_bh. */
+       rc = iam_txn_add(h, p, c->ic_idle_bh);
+       if (rc != 0)
+               goto unlock;
+
+       head->iih_blks[count] = cpu_to_le32(blk);
+       head->iih_count = cpu_to_le16(count + 1);
+       rc = iam_txn_dirty(h, p, c->ic_idle_bh);
+
+unlock:
+       cfs_up(&c->ic_idle_sem);
+       if (rc != 0)
+               CWARN("%.16s: idle blocks failed, will lose the blk %u\n",
+                     LDISKFS_SB(inode->i_sb)->s_es->s_volume_name, blk);
+}
+
 /*
  * Delete record under iterator.
  *
@@ -2083,12 +2435,22 @@ int iam_it_rec_delete(handle_t *h, struct iam_iterator *it)
         if (result == 0) {
                 iam_rec_del(leaf, it->ii_flags&IAM_IT_MOVE);
                 result = iam_txn_dirty(h, path, leaf->il_bh);
-                if (result == 0 && iam_leaf_at_end(leaf) &&
-                    it->ii_flags&IAM_IT_MOVE) {
-                        result = iam_it_next(it);
-                        if (result > 0)
-                                result = 0;
-                }
+               if (result == 0 && iam_leaf_at_end(leaf)) {
+                       struct buffer_head *bh = NULL;
+                       iam_ptr_t blk;
+
+                       blk = iam_index_shrink(h, path, leaf, &bh);
+                       if (it->ii_flags & IAM_IT_MOVE) {
+                               result = iam_it_next(it);
+                               if (result > 0)
+                                       result = 0;
+                       }
+
+                       if (bh != NULL) {
+                               iam_recycle_leaf(h, path, bh, blk);
+                               brelse(bh);
+                       }
+               }
         }
         assert_inv(iam_leaf_check(leaf));
         assert_inv(iam_path_check(path));
index 293f2e3..f55a11b 100644 (file)
@@ -219,6 +219,9 @@ struct iam_frame {
         iam_ptr_t         curidx;  /* (logical) offset of this node. Used to
                                     * per-node locking to detect concurrent
                                     * splits. */
+       unsigned int      at_shifted:1; /* The "at" entry has moved to next
+                                        * because of shrinking index node
+                                        * for recycling empty leaf node. */
 };
 
 /*
@@ -403,6 +406,10 @@ struct iam_leaf_operations {
          */
         void (*split)(struct iam_leaf *l, struct buffer_head **bh,
                       iam_ptr_t newblknr);
+       /*
+        * the leaf is empty?
+        */
+       int (*leaf_empty)(struct iam_leaf *l);
 };
 
 /*
@@ -441,6 +448,20 @@ struct iam_descr {
         struct iam_leaf_operations      *id_leaf_ops;
 };
 
+enum {
+       IAM_IDLE_HEADER_MAGIC = 0x7903,
+};
+
+/*
+ * Header structure to record idle blocks.
+ */
+struct iam_idle_head {
+       __le16 iih_magic;
+       __le16 iih_count; /* how many idle blocks in this head */
+       __le32 iih_next; /* next head for idle blocks */
+       __le32 iih_blks[0];
+};
+
 /*
  * An instance of iam container.
  */
@@ -463,6 +484,15 @@ struct iam_container {
          */
         cfs_rw_semaphore_t   ic_sem;
        struct dynlock       ic_tree_lock;
+       /*
+        * Protect ic_idle_bh
+        */
+       cfs_semaphore_t      ic_idle_sem;
+       /*
+        * BH for idle blocks
+        */
+       struct buffer_head  *ic_idle_bh;
+       unsigned int         ic_idle_failed:1; /* Idle block mechanism failed */
 };
 
 /*
@@ -820,8 +850,8 @@ struct fake_dirent {
 };
 
 struct dx_countlimit {
-        __le16 limit;
-        __le16 count;
+       __le16 limit;
+       __le16 count;
 };
 
 /*
index f676794..efea0b3 100644 (file)
@@ -511,6 +511,11 @@ static void iam_lfix_split(struct iam_leaf *l, struct buffer_head **bh,
         iam_insert_key_lock(path, path->ip_frame, pivot, new_blknr);
 }
 
+static int iam_lfix_leaf_empty(struct iam_leaf *leaf)
+{
+       return lentry_count_get(leaf) == 0;
+}
+
 static struct iam_leaf_operations iam_lfix_leaf_ops = {
         .init           = iam_lfix_init,
         .init_new       = iam_lfix_init_new,
@@ -533,7 +538,8 @@ static struct iam_leaf_operations iam_lfix_leaf_ops = {
         .rec_add        = iam_lfix_rec_add,
         .rec_del        = iam_lfix_rec_del,
         .can_add        = iam_lfix_can_add,
-        .split          = iam_lfix_split
+       .split          = iam_lfix_split,
+       .leaf_empty     = iam_lfix_leaf_empty,
 };
 
 /*
@@ -695,11 +701,14 @@ static int iam_lfix_guess(struct iam_container *c)
                         descr->id_node_gap  = 0;
                         descr->id_ops       = &iam_lfix_ops;
                         descr->id_leaf_ops  = &iam_lfix_leaf_ops;
-                } else
-                        result = -EBADF;
-                brelse(bh);
-        }
-        return result;
+
+                       c->ic_root_bh = bh;
+               } else {
+                       result = -EBADF;
+                       brelse(bh);
+               }
+       }
+       return result;
 }
 
 static struct iam_format iam_lfix_format = {
@@ -785,6 +794,18 @@ static void lfix_root(void *buf,
                                         blocksize, keysize + ptrsize)
         };
 
+       /* To guarantee that the padding "keysize + ptrsize"
+        * covers the "dx_countlimit" and the "idle_blocks". */
+       LASSERT((keysize + ptrsize) >=
+               (sizeof(struct dx_countlimit) + sizeof(__u32)));
+
+       entry = limit + 1;
+       /* Put "idle_blocks" just after the limit. There was padding after
+        * the limit, the "idle_blocks" re-uses part of the padding, so no
+        * compatibility issues with old layout.
+        */
+       *(__u32 *)entry = 0;
+
         entry = root + 1;
         /*
          * Skip over @limit.
@@ -799,18 +820,20 @@ static void lfix_root(void *buf,
          * XXX: this key is hard-coded to be a sequence of 0's.
          */
 
-        entry += keysize;
-        /* now @entry points to <ptr> */
-        if (ptrsize == 4)
-                STORE_UNALIGNED(cpu_to_le32(1), (u_int32_t *)entry);
-        else
-                STORE_UNALIGNED(cpu_to_le64(1), (u_int64_t *)entry);
+       memset(entry, 0, keysize);
+       entry += keysize;
+       /* now @entry points to <ptr> */
+       if (ptrsize == 4)
+               STORE_UNALIGNED(cpu_to_le32(1), (u_int32_t *)entry);
+       else
+               STORE_UNALIGNED(cpu_to_le64(1), (u_int64_t *)entry);
 }
 
 static void lfix_leaf(void *buf,
-                      int blocksize, int keysize, int ptrsize, int recsize)
+                     int blocksize, int keysize, int ptrsize, int recsize)
 {
-        struct iam_leaf_head *head;
+       struct iam_leaf_head *head;
+       void *entry;
 
         /* form leaf */
         head = buf;
@@ -822,6 +845,9 @@ static void lfix_leaf(void *buf,
                  */
                 .ill_count = cpu_to_le16(1),
         };
+
+       entry = (void *)(head + 1);
+       memset(entry, 0, keysize + recsize);
 }
 
 int iam_lfix_create(struct inode *obj,
index 75068e5..9fc9fd7 100644 (file)
@@ -765,6 +765,11 @@ static void lvar_split(struct iam_leaf *leaf, struct buffer_head **bh,
         assert_inv(n_invariant(leaf));
 }
 
+static int lvar_leaf_empty(struct iam_leaf *leaf)
+{
+       return h_used(n_head(leaf)) == sizeof(struct lvar_leaf_header);
+}
+
 static struct iam_leaf_operations lvar_leaf_ops = {
         .init           = lvar_init,
         .init_new       = lvar_init_new,
@@ -787,7 +792,8 @@ static struct iam_leaf_operations lvar_leaf_ops = {
         .rec_add        = lvar_rec_add,
         .rec_del        = lvar_rec_del,
         .can_add        = lvar_can_add,
-        .split          = lvar_split
+       .split          = lvar_split,
+       .leaf_empty     = lvar_leaf_empty,
 };
 
 /*
@@ -938,6 +944,18 @@ static void lvar_root(void *buf,
                                         sizeof (lvar_hash_t) + ptrsize)
         };
 
+       /* To guarantee that the padding "keysize + ptrsize"
+        * covers the "dx_countlimit" and the "idle_blocks". */
+       LASSERT((keysize + ptrsize) >=
+               (sizeof(struct dx_countlimit) + sizeof(__u32)));
+
+       entry = limit + 1;
+       /* Put "idle_blocks" just after the limit. There was padding after
+        * the limit, the "idle_blocks" re-uses part of the padding, so no
+        * compatibility issues with old layout.
+        */
+       *(__u32 *)entry = 0;
+
         entry = root + 1;
         /*
          * Skip over @limit.
@@ -1054,11 +1072,14 @@ static int lvar_guess(struct iam_container *c)
                         descr->id_node_gap  = 0;
                         descr->id_ops       = &lvar_ops;
                         descr->id_leaf_ops  = &lvar_leaf_ops;
-                } else
-                        result = -EBADF;
-                brelse(bh);
-        }
-        return result;
+
+                       c->ic_root_bh = bh;
+               } else {
+                       result = -EBADF;
+                       brelse(bh);
+               }
+       }
+       return result;
 }
 
 static struct iam_format lvar_format = {
index cb0a589..e37c7f3 100644 (file)
@@ -9565,6 +9565,46 @@ test_227() {
 }
 run_test 227 "running truncated executable does not cause OOM"
 
+# LU-1512 try to reuse idle OI blocks
+test_228() {
+       [ "$FSTYPE" != "ldiskfs" ] && skip "non-ldiskfs backend" && return
+
+       local MDT_DEV=$(mdsdevname ${SINGLEMDS//mds/})
+       local myDIR=$DIR/$tdir
+
+       mkdir -p $myDIR
+       #define OBD_FAIL_SEQ_EXHAUST             0x1002
+       $LCTL set_param fail_loc=0x80001002
+       createmany -o $myDIR/t- 10000
+       $LCTL set_param fail_loc=0
+       # The guard is current the largest FID holder
+       touch $myDIR/guard
+       local SEQ=$($LFS path2fid $myDIR/guard | awk -F ':' '{print $1}' |
+                   tr -d '[')
+       local IDX=$(($SEQ % 64))
+
+       do_facet $SINGLEMDS sync
+       # Make sure journal flushed.
+       sleep 6
+       local blk1=$(do_facet $SINGLEMDS \
+                    "$DEBUGFS -c -R \\\"stat oi.16.${IDX}\\\" $MDT_DEV" |
+                    grep Blockcount | awk '{print $4}')
+
+       # Remove old files, some OI blocks will become idle.
+       unlinkmany $myDIR/t- 10000
+       # Create new files, idle OI blocks should be reused.
+       createmany -o $myDIR/t- 2000
+       do_facet $SINGLEMDS sync
+       # Make sure journal flushed.
+       sleep 6
+       local blk2=$(do_facet $SINGLEMDS \
+                    "$DEBUGFS -c -R \\\"stat oi.16.${IDX}\\\" $MDT_DEV" |
+                    grep Blockcount | awk '{print $4}')
+
+       [ $blk1 == $blk2 ] || error "old blk1=$blk1, new blk2=$blk2, unmatched!"
+}
+run_test 228 "try to reuse idle OI blocks"
+
 #
 # tests that do cleanup/setup should be run at the end
 #