1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.gnu.org/licenses/gpl-2.0.html
19  *
20  * GPL HEADER END
21  */
22 /*
23  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Use is subject to license terms.
25  *
26  * Copyright (c) 2011, 2017, Intel Corporation.
27  */
28 /*
29  * This file is part of Lustre, http://www.lustre.org/
30  * Lustre is a trademark of Sun Microsystems, Inc.
31  *
32  * iam.c
33  * Top-level entry points into iam module
34  *
35  * Author: Wang Di <wangdi@clusterfs.com>
36  * Author: Nikita Danilov <nikita@clusterfs.com>
37  */
38
39 /*
40  * iam: big theory statement.
41  *
42  * iam (Index Access Module) is a module providing abstraction of persistent
43  * transactional container on top of generalized ldiskfs htree.
44  *
45  * iam supports:
46  *
47  *     - key, pointer, and record size specifiable per container.
48  *
49  *     - trees taller than 2 index levels.
50  *
51  *     - read/write to existing ldiskfs htree directories as iam containers.
52  *
53  * iam container is a tree, consisting of leaf nodes containing keys and
54  * records stored in this container, and index nodes, containing keys and
55  * pointers to leaf or index nodes.
56  *
57  * iam does not work with keys directly; instead it calls a user-supplied key
58  * comparison function (->dpo_keycmp()).
59  *
60  * Pointers are (currently) interpreted as logical offsets (measured in
61  * blocks) within the underlying flat file on top of which the iam tree lives.
62  *
63  * On-disk format:
64  *
65  * iam mostly tries to reuse existing htree formats.
66  *
67  * Format of index node:
68  *
69  * +-----+-------+-------+-------+------+-------+------------+
70  * |     | count |       |       |      |       |            |
71  * | gap |   /   | entry | entry | .... | entry | free space |
72  * |     | limit |       |       |      |       |            |
73  * +-----+-------+-------+-------+------+-------+------------+
74  *
75  *       gap           this part of node is never accessed by iam code. It
76  *                     exists for binary compatibility with ldiskfs htree (that,
77  *                     in turn, stores fake struct ext2_dirent for ext2
78  *                     compatibility), and to keep some unspecified per-node
79  *                     data. Gap can be different for root and non-root index
80  *                     nodes. Gap size can be specified for each container
81  *                     (gap of 0 is allowed).
82  *
83  *       count/limit   current number of entries in this node, and the maximal
84  *                     number of entries that can fit into node. count/limit
85  *                     has the same size as entry, and is itself counted in
86  *                     count.
87  *
88  *       entry         index entry: consists of a key immediately followed by
89  *                     a pointer to a child node. Size of a key and size of a
90  *                     pointer depend on the container. An entry has neither
91  *                     alignment nor padding.
92  *
93  *       free space    portion of the node to which new entries are added
94  *
95  * Entries in index node are sorted by their key value.
96  *
97  * Format of a leaf node is not specified. Generic iam code accesses leaf
98  * nodes through ->id_leaf methods in struct iam_descr.
99  *
100  * The IAM root block is a special node, which contains the IAM descriptor.
101  * Its on-disk format is:
102  *
103  * +---------+-------+--------+---------+-------+------+-------+------------+
104  * |IAM desc | count |  idle  |         |       |      |       |            |
105  * |(fix/var)|   /   | blocks | padding | entry | .... | entry | free space |
106  * |         | limit |        |         |       |      |       |            |
107  * +---------+-------+--------+---------+-------+------+-------+------------+
108  *
109  * The padding length is calculated with the parameters in the IAM descriptor.
110  *
111  * The field "idle_blocks" is used to record empty leaf nodes: nodes that
112  * have not been released although all entries in them have been removed.
113  * Ideally, these idle blocks would be reused whenever new leaf nodes are
114  * needed for new entries, but that would require the IAM hash functions to
115  * map the new entries onto exactly these idle blocks. Unfortunately, it is
116  * hard to design hash functions for such a clever mapping, especially
117  * without hurting insert/lookup performance.
118  *
119  * So the IAM recycles the empty leaf nodes and puts them into a per-file
120  * idle blocks pool. When a new leaf node is needed, a block is taken from
121  * this pool first, regardless of where the IAM hash functions would map
122  * the entry.
123  *
124  * The idle blocks pool is organized as a series of tables, and each table
125  * can be described as following (on-disk format):
126  *
127  * +---------+---------+---------+---------+------+---------+-------+
128  * |  magic  |  count  |  next   |  logic  |      |  logic  | free  |
129  * |(16 bits)|(16 bits)|  table  |  blk #  | .... |  blk #  | space |
130  * |         |         |(32 bits)|(32 bits)|      |(32 bits)|       |
131  * +---------+---------+---------+---------+------+---------+-------+
132  *
133  * The logic blk# for the first table is stored in the root node "idle_blocks".
134  *
135  */
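/*
 * Illustrative sketch (an assumption, not the authoritative definition): the
 * idle-blocks table header described above corresponds to the iam_idle_head
 * layout used later in this file, followed by an array of logical block
 * numbers filling the rest of the block:
 *
 *	struct iam_idle_head {
 *		__le16 iih_magic;	(IAM_IDLE_HEADER_MAGIC)
 *		__le16 iih_count;	(number of idle blocks listed in this table)
 *		__le32 iih_next;	(logical blk # of the next table, 0 if none)
 *		__le32 iih_blks[];	(idle block numbers)
 *	};
 *
 * Field names are taken from their uses below; the real definition lives in
 * the osd headers.
 */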
136
137 #include <linux/module.h>
138 #include <linux/fs.h>
139 #include <linux/pagemap.h>
140 #include <linux/time.h>
141 #include <linux/fcntl.h>
142 #include <linux/stat.h>
143 #include <linux/string.h>
144 #include <linux/quotaops.h>
145 #include <linux/buffer_head.h>
146
147 #include <ldiskfs/ldiskfs.h>
148 #include <ldiskfs/xattr.h>
149 #undef ENTRY
150
151 #include "osd_internal.h"
152
153 #include <ldiskfs/acl.h>
154
155 static struct buffer_head *
156 iam_load_idle_blocks(struct iam_container *c, iam_ptr_t blk)
157 {
158         struct inode *inode = c->ic_object;
159         struct iam_idle_head *head;
160         struct buffer_head *bh;
161
162         LASSERT(mutex_is_locked(&c->ic_idle_mutex));
163
164         if (blk == 0)
165                 return NULL;
166
167         bh = __ldiskfs_bread(NULL, inode, blk, 0);
168         if (IS_ERR_OR_NULL(bh)) {
169                 CERROR("%s: cannot load idle blocks, blk = %u: rc = %ld\n",
170                        osd_ino2name(inode), blk, bh ? PTR_ERR(bh) : -EIO);
171                 c->ic_idle_failed = 1;
172                 if (bh == NULL)
173                         bh = ERR_PTR(-EIO);
174                 return bh;
175         }
176
177         head = (struct iam_idle_head *)(bh->b_data);
178         if (le16_to_cpu(head->iih_magic) != IAM_IDLE_HEADER_MAGIC) {
179                 int rc = -EBADF;
180
181                 CERROR("%s: invalid idle block head, blk = %u, magic = %x: rc = %d\n",
182                        osd_ino2name(inode), blk, le16_to_cpu(head->iih_magic),
183                        rc);
184                 brelse(bh);
185                 c->ic_idle_failed = 1;
186                 return ERR_PTR(rc);
187         }
188
189         return bh;
190 }
191
192 /*
193  * Determine format of given container. This is done by scanning list of
194  * registered formats and calling ->if_guess() method of each in turn.
195  */
196 static int iam_format_guess(struct iam_container *c)
197 {
198         int result;
199
200         result = iam_lvar_guess(c);
201         if (result)
202                 result = iam_lfix_guess(c);
203
204         if (result == 0) {
205                 struct buffer_head *bh;
206                 __u32 *idle_blocks;
207
208                 LASSERT(c->ic_root_bh != NULL);
209
210                 idle_blocks = (__u32 *)(c->ic_root_bh->b_data +
211                                         c->ic_descr->id_root_gap +
212                                         sizeof(struct dx_countlimit));
213                 mutex_lock(&c->ic_idle_mutex);
214                 bh = iam_load_idle_blocks(c, le32_to_cpu(*idle_blocks));
215                 if (bh != NULL && IS_ERR(bh))
216                         result = PTR_ERR(bh);
217                 else
218                         c->ic_idle_bh = bh;
219                 mutex_unlock(&c->ic_idle_mutex);
220         }
221
222         return result;
223 }
224
225 /*
226  * Initialize container @c.
227  */
228 int iam_container_init(struct iam_container *c,
229                        struct iam_descr *descr, struct inode *inode)
230 {
231         memset(c, 0, sizeof *c);
232         c->ic_descr = descr;
233         c->ic_object = inode;
234         init_rwsem(&c->ic_sem);
235         dynlock_init(&c->ic_tree_lock);
236         mutex_init(&c->ic_idle_mutex);
237         return 0;
238 }
239
240 /*
241  * Determine container format.
242  */
243 int iam_container_setup(struct iam_container *c)
244 {
245         return iam_format_guess(c);
246 }
247
248 /*
249  * Finalize container @c, release all resources.
250  */
251 void iam_container_fini(struct iam_container *c)
252 {
253         brelse(c->ic_idle_bh);
254         c->ic_idle_bh = NULL;
255         brelse(c->ic_root_bh);
256         c->ic_root_bh = NULL;
257 }
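/*
 * Hedged usage sketch (not part of the API proper): a typical container
 * life-cycle as seen from the functions above. Error handling is omitted and
 * "descr" and "inode" are assumed to be prepared by the caller:
 *
 *	struct iam_container c;
 *
 *	iam_container_init(&c, descr, inode);
 *	if (iam_container_setup(&c) == 0) {
 *		... operate on the container through the iterator interface
 *		    defined below ...
 *	}
 *	iam_container_fini(&c);
 */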
258
259 void iam_path_init(struct iam_path *path, struct iam_container *c,
260                    struct iam_path_descr *pd)
261 {
262         memset(path, 0, sizeof *path);
263         path->ip_container = c;
264         path->ip_frame = path->ip_frames;
265         path->ip_data = pd;
266         path->ip_leaf.il_path = path;
267 }
268
269 static void iam_leaf_fini(struct iam_leaf *leaf);
270
271 void iam_path_release(struct iam_path *path)
272 {
273         int i;
274
275         for (i = 0; i < ARRAY_SIZE(path->ip_frames); i++) {
276                 if (path->ip_frames[i].bh != NULL) {
277                         path->ip_frames[i].at_shifted = 0;
278                         brelse(path->ip_frames[i].bh);
279                         path->ip_frames[i].bh = NULL;
280                 }
281         }
282 }
283
284 void iam_path_fini(struct iam_path *path)
285 {
286         iam_leaf_fini(&path->ip_leaf);
287         iam_path_release(path);
288 }
289
290
291 void iam_path_compat_init(struct iam_path_compat *path, struct inode *inode)
292 {
293         int i;
294
295         path->ipc_hinfo = &path->ipc_hinfo_area;
296         for (i = 0; i < ARRAY_SIZE(path->ipc_scratch); ++i)
297                 path->ipc_descr.ipd_key_scratch[i] =
298                         (struct iam_ikey *)&path->ipc_scratch[i];
299
300         iam_path_init(&path->ipc_path, &path->ipc_container, &path->ipc_descr);
301 }
302
303 void iam_path_compat_fini(struct iam_path_compat *path)
304 {
305         iam_path_fini(&path->ipc_path);
306 }
307
308 /*
309  * Helper function initializing iam_path_descr and its key scratch area.
310  */
311 struct iam_path_descr *iam_ipd_alloc(void *area, int keysize)
312 {
313         struct iam_path_descr *ipd;
314         void *karea;
315         int i;
316
317         ipd = area;
318         karea = ipd + 1;
319         for (i = 0; i < ARRAY_SIZE(ipd->ipd_key_scratch); ++i, karea += keysize)
320                 ipd->ipd_key_scratch[i] = karea;
321         return ipd;
322 }
323
324 void iam_ipd_free(struct iam_path_descr *ipd)
325 {
326 }
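/*
 * Hedged usage sketch for iam_ipd_alloc()/iam_ipd_free(): the caller provides
 * a single flat area large enough for the descriptor itself plus
 * ARRAY_SIZE(ipd_key_scratch) keys of "keysize" bytes each. The names "area"
 * and "area_size" below are illustrative only:
 *
 *	void *area = kmalloc(area_size, GFP_NOFS);
 *	struct iam_path_descr *ipd;
 *
 *	if (area == NULL)
 *		return -ENOMEM;
 *	ipd = iam_ipd_alloc(area, keysize);
 *	... use ipd as the iam_path_descr of an iam_path ...
 *	iam_ipd_free(ipd);
 *	kfree(area);
 */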
327
328 int iam_node_read(struct iam_container *c, iam_ptr_t ptr,
329                   handle_t *h, struct buffer_head **bh)
330 {
331         /*
332          * NB: it can be called by iam_lfix_guess() which is still at
333          * very early stage, c->ic_root_bh and c->ic_descr->id_ops
334          * haven't been intialized yet.
335          * haven't been initialized yet.
336          */
337         if (c->ic_root_bh != NULL &&
338             c->ic_descr->id_ops->id_root_ptr(c) == ptr) {
339                 get_bh(c->ic_root_bh);
340                 *bh = c->ic_root_bh;
341                 return 0;
342         }
343
344         *bh = __ldiskfs_bread(h, c->ic_object, (int)ptr, 0);
345         if (IS_ERR(*bh))
346                 return PTR_ERR(*bh);
347
348         if (*bh == NULL)
349                 return -EIO;
350
351         return 0;
352 }
353
354 /*
355  * Return pointer to current leaf record. Pointer is valid while corresponding
356  * leaf node is locked and pinned.
357  */
358 static struct iam_rec *iam_leaf_rec(const struct iam_leaf *leaf)
359 {
360         return iam_leaf_ops(leaf)->rec(leaf);
361 }
362
363 /*
364  * Return pointer to the current leaf key. This function returns pointer to
365  * the key stored in node.
366  *
367  * Caller should assume that returned pointer is only valid while leaf node is
368  * pinned and locked.
369  */
370 static struct iam_key *iam_leaf_key(const struct iam_leaf *leaf)
371 {
372         return iam_leaf_ops(leaf)->key(leaf);
373 }
374
375 static int iam_leaf_key_size(const struct iam_leaf *leaf)
376 {
377         return iam_leaf_ops(leaf)->key_size(leaf);
378 }
379
380 static struct iam_ikey *iam_leaf_ikey(const struct iam_leaf *leaf,
381                                       struct iam_ikey *key)
382 {
383         return iam_leaf_ops(leaf)->ikey(leaf, key);
384 }
385
386 static int iam_leaf_keycmp(const struct iam_leaf *leaf,
387                            const struct iam_key *key)
388 {
389         return iam_leaf_ops(leaf)->key_cmp(leaf, key);
390 }
391
392 static int iam_leaf_keyeq(const struct iam_leaf *leaf,
393                           const struct iam_key *key)
394 {
395         return iam_leaf_ops(leaf)->key_eq(leaf, key);
396 }
397
398 #if LDISKFS_INVARIANT_ON
399 static int iam_path_check(struct iam_path *p)
400 {
401         int i;
402         int result;
403         struct iam_frame *f;
404         struct iam_descr *param;
405
406         result = 1;
407         param = iam_path_descr(p);
408         for (i = 0; result && i < ARRAY_SIZE(p->ip_frames); ++i) {
409                 f = &p->ip_frames[i];
410                 if (f->bh != NULL) {
411                         result = dx_node_check(p, f);
412                         if (result)
413                                 result = !param->id_ops->id_node_check(p, f);
414                 }
415         }
416         if (result && p->ip_leaf.il_bh != NULL)
417                 result = 1;
418         if (result == 0)
419                 ldiskfs_std_error(iam_path_obj(p)->i_sb, result);
420
421         return result;
422 }
423 #endif
424
425 static int iam_leaf_load(struct iam_path *path)
426 {
427         iam_ptr_t block;
428         int err;
429         struct iam_container *c;
430         struct buffer_head *bh;
431         struct iam_leaf *leaf;
432         struct iam_descr *descr;
433
434         c     = path->ip_container;
435         leaf  = &path->ip_leaf;
436         descr = iam_path_descr(path);
437         block = path->ip_frame->leaf;
438         if (block == 0) {
439                 /* XXX bug 11027 */
440                 printk(KERN_EMERG "wrong leaf: %lu %d [%p %p %p]\n",
441                        (long unsigned)path->ip_frame->leaf,
442                        dx_get_count(dx_node_get_entries(path, path->ip_frame)),
443                        path->ip_frames[0].bh, path->ip_frames[1].bh,
444                        path->ip_frames[2].bh);
445         }
446         err = descr->id_ops->id_node_read(c, block, NULL, &bh);
447         if (err == 0) {
448                 leaf->il_bh = bh;
449                 leaf->il_curidx = block;
450                 err = iam_leaf_ops(leaf)->init(leaf);
451         }
452         return err;
453 }
454
455 static void iam_unlock_htree(struct iam_container *ic,
456                              struct dynlock_handle *lh)
457 {
458         if (lh != NULL)
459                 dynlock_unlock(&ic->ic_tree_lock, lh);
460 }
461
462
463 static void iam_leaf_unlock(struct iam_leaf *leaf)
464 {
465         if (leaf->il_lock != NULL) {
466                 iam_unlock_htree(iam_leaf_container(leaf),
467                                  leaf->il_lock);
468                 do_corr(schedule());
469                 leaf->il_lock = NULL;
470         }
471 }
472
473 static void iam_leaf_fini(struct iam_leaf *leaf)
474 {
475         if (leaf->il_path != NULL) {
476                 iam_leaf_unlock(leaf);
477                 iam_leaf_ops(leaf)->fini(leaf);
478                 if (leaf->il_bh) {
479                         brelse(leaf->il_bh);
480                         leaf->il_bh = NULL;
481                         leaf->il_curidx = 0;
482                 }
483         }
484 }
485
486 static void iam_leaf_start(struct iam_leaf *folio)
487 {
488         iam_leaf_ops(folio)->start(folio);
489 }
490
491 void iam_leaf_next(struct iam_leaf *folio)
492 {
493         iam_leaf_ops(folio)->next(folio);
494 }
495
496 static void iam_leaf_rec_add(struct iam_leaf *leaf, const struct iam_key *key,
497                              const struct iam_rec *rec)
498 {
499         iam_leaf_ops(leaf)->rec_add(leaf, key, rec);
500 }
501
502 static void iam_rec_del(struct iam_leaf *leaf, int shift)
503 {
504         iam_leaf_ops(leaf)->rec_del(leaf, shift);
505 }
506
507 int iam_leaf_at_end(const struct iam_leaf *leaf)
508 {
509         return iam_leaf_ops(leaf)->at_end(leaf);
510 }
511
512 static void iam_leaf_split(struct iam_leaf *l, struct buffer_head **bh,
513                            iam_ptr_t nr)
514 {
515         iam_leaf_ops(l)->split(l, bh, nr);
516 }
517
518 static inline int iam_leaf_empty(struct iam_leaf *l)
519 {
520         return iam_leaf_ops(l)->leaf_empty(l);
521 }
522
523 int iam_leaf_can_add(const struct iam_leaf *l,
524                      const struct iam_key *k, const struct iam_rec *r)
525 {
526         return iam_leaf_ops(l)->can_add(l, k, r);
527 }
528
529 static int iam_txn_dirty(handle_t *handle,
530                          struct iam_path *path, struct buffer_head *bh)
531 {
532         int result;
533
534         result = ldiskfs_handle_dirty_metadata(handle, NULL, bh);
535         if (result != 0)
536                 ldiskfs_std_error(iam_path_obj(path)->i_sb, result);
537         return result;
538 }
539
540 static int iam_txn_add(handle_t *handle,
541                        struct iam_path *path, struct buffer_head *bh)
542 {
543         int result;
544
545         result = ldiskfs_journal_get_write_access(handle, bh);
546         if (result != 0)
547                 ldiskfs_std_error(iam_path_obj(path)->i_sb, result);
548         return result;
549 }
550
551 /***********************************************************************/
552 /* iterator interface                                                  */
553 /***********************************************************************/
554
555 static enum iam_it_state it_state(const struct iam_iterator *it)
556 {
557         return it->ii_state;
558 }
559
560 /*
561  * Helper function returning the container the iterator is attached to.
562  */
563 static struct iam_container *iam_it_container(const struct iam_iterator *it)
564 {
565         return it->ii_path.ip_container;
566 }
567
568 static inline int it_keycmp(const struct iam_iterator *it,
569                             const struct iam_key *k)
570 {
571         return iam_leaf_keycmp(&it->ii_path.ip_leaf, k);
572 }
573
574 static inline int it_keyeq(const struct iam_iterator *it,
575                            const struct iam_key *k)
576 {
577         return iam_leaf_keyeq(&it->ii_path.ip_leaf, k);
578 }
579
580 static int it_ikeycmp(const struct iam_iterator *it, const struct iam_ikey *ik)
581 {
582         return iam_ikeycmp(it->ii_path.ip_container,
583                            iam_leaf_ikey(&it->ii_path.ip_leaf,
584                                         iam_path_ikey(&it->ii_path, 0)), ik);
585 }
586
587 static inline int it_at_rec(const struct iam_iterator *it)
588 {
589         return !iam_leaf_at_end(&it->ii_path.ip_leaf);
590 }
591
592 static inline int it_before(const struct iam_iterator *it)
593 {
594         return it_state(it) == IAM_IT_SKEWED && it_at_rec(it);
595 }
596
597 /*
598  * Helper wrapper around iam_it_get(): returns 0 (success) only when record
599  * with exactly the same key as asked is found.
600  */
601 static int iam_it_get_exact(struct iam_iterator *it, const struct iam_key *k)
602 {
603         int result;
604
605         result = iam_it_get(it, k);
606         if (result > 0)
607                 result = 0;
608         else if (result == 0)
609                 /*
610                  * Return -ENOENT if cursor is located above record with a key
611                  * different from one specified, or in the empty leaf.
612                  *
613                  * XXX returning -ENOENT only works if iam_it_get() never
614                  * returns -ENOENT as a legitimate error.
615                  */
616                 result = -ENOENT;
617         return result;
618 }
619
620 void iam_container_write_lock(struct iam_container *ic)
621 {
622         down_write(&ic->ic_sem);
623 }
624
625 void iam_container_write_unlock(struct iam_container *ic)
626 {
627         up_write(&ic->ic_sem);
628 }
629
630 void iam_container_read_lock(struct iam_container *ic)
631 {
632         down_read(&ic->ic_sem);
633 }
634
635 void iam_container_read_unlock(struct iam_container *ic)
636 {
637         up_read(&ic->ic_sem);
638 }
639
640 /*
641  * Initialize iterator to IAM_IT_DETACHED state.
642  *
643  * postcondition: it_state(it) == IAM_IT_DETACHED
644  */
645 int  iam_it_init(struct iam_iterator *it, struct iam_container *c, __u32 flags,
646                  struct iam_path_descr *pd)
647 {
648         memset(it, 0, sizeof *it);
649         it->ii_flags  = flags;
650         it->ii_state  = IAM_IT_DETACHED;
651         iam_path_init(&it->ii_path, c, pd);
652         return 0;
653 }
654
655 /*
656  * Finalize iterator and release all resources.
657  *
658  * precondition: it_state(it) == IAM_IT_DETACHED
659  */
660 void iam_it_fini(struct iam_iterator *it)
661 {
662         assert_corr(it_state(it) == IAM_IT_DETACHED);
663         iam_path_fini(&it->ii_path);
664 }
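/*
 * Hedged usage sketch of the iterator interface (locking and error handling
 * simplified; "c", "pd" and "key" are assumed to be prepared by the caller):
 *
 *	struct iam_iterator it;
 *	int rc;
 *
 *	iam_it_init(&it, c, IAM_IT_MOVE, pd);
 *
 *	iam_container_read_lock(c);
 *	rc = iam_it_get_at(&it, key);        (>= 0: positioned, < 0: error)
 *	while (rc >= 0) {
 *		... inspect iam_it_key_get(&it) and iam_it_rec_get(&it) ...
 *		rc = iam_it_next(&it);       (+1 means end of container)
 *		if (rc != 0)
 *			break;
 *	}
 *	iam_it_put(&it);
 *	iam_container_read_unlock(c);
 *
 *	iam_it_fini(&it);
 */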
665
666 /*
667  * this locking primitives are used to protect parts
668  * of dir's htree. protection unit is block: leaf or index
669  */
670 static struct dynlock_handle *iam_lock_htree(struct iam_container *ic,
671                                              unsigned long value,
672                                              enum dynlock_type lt)
673 {
674         return dynlock_lock(&ic->ic_tree_lock, value, lt, GFP_NOFS);
675 }
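/*
 * Minimal sketch of how these helpers pair up (mirroring iam_lookup_lock()
 * below; "path" is assumed to be an already looked-up iam_path):
 *
 *	struct dynlock_handle *lh;
 *
 *	lh = iam_lock_htree(path->ip_container, path->ip_frame->leaf, DLT_READ);
 *	if (lh == NULL)
 *		return -ENOMEM;
 *	... access the block path->ip_frame->leaf ...
 *	iam_unlock_htree(path->ip_container, lh);
 */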
676
677 static int iam_index_lock(struct iam_path *path, struct dynlock_handle **lh)
678 {
679         struct iam_frame *f;
680
681         for (f = path->ip_frame; f >= path->ip_frames; --f, ++lh) {
682                 do_corr(schedule());
683                 *lh = iam_lock_htree(path->ip_container, f->curidx, DLT_READ);
684                 if (*lh == NULL)
685                         return -ENOMEM;
686         }
687         return 0;
688 }
689
690 /*
691  * Fast check for frame consistency.
692  */
693 static int iam_check_fast(struct iam_path *path, struct iam_frame *frame)
694 {
695         struct iam_container *bag;
696         struct iam_entry *next;
697         struct iam_entry *last;
698         struct iam_entry *entries;
699         struct iam_entry *at;
700
701         bag = path->ip_container;
702         at = frame->at;
703         entries = frame->entries;
704         last = iam_entry_shift(path, entries, dx_get_count(entries) - 1);
705
706         if (unlikely(at > last))
707                 return -EAGAIN;
708
709         if (unlikely(dx_get_block(path, at) != frame->leaf))
710                 return -EAGAIN;
711
712         if (unlikely(iam_ikeycmp(bag, iam_ikey_at(path, at),
713                      path->ip_ikey_target) > 0))
714                 return -EAGAIN;
715
716         next = iam_entry_shift(path, at, +1);
717         if (next <= last) {
718                 if (unlikely(iam_ikeycmp(bag, iam_ikey_at(path, next),
719                                          path->ip_ikey_target) <= 0))
720                         return -EAGAIN;
721         }
722         return 0;
723 }
724
725 int dx_index_is_compat(struct iam_path *path)
726 {
727         return iam_path_descr(path) == NULL;
728 }
729
730 /*
731  * iam_find_position
732  *
733  * Binary search for the position of the target index key (hash) in an index node.
734  *
735  */
736
737 static struct iam_entry *iam_find_position(struct iam_path *path,
738                                            struct iam_frame *frame)
739 {
740         int count;
741         struct iam_entry *p;
742         struct iam_entry *q;
743         struct iam_entry *m;
744
745         count = dx_get_count(frame->entries);
746         assert_corr(count && count <= dx_get_limit(frame->entries));
747         p = iam_entry_shift(path, frame->entries,
748                             dx_index_is_compat(path) ? 1 : 2);
749         q = iam_entry_shift(path, frame->entries, count - 1);
750         while (p <= q) {
751                 m = iam_entry_shift(path, p, iam_entry_diff(path, q, p) / 2);
752                 if (iam_ikeycmp(path->ip_container, iam_ikey_at(path, m),
753                                 path->ip_ikey_target) > 0)
754                         q = iam_entry_shift(path, m, -1);
755                 else
756                         p = iam_entry_shift(path, m, +1);
757         }
758         return iam_entry_shift(path, p, -1);
759 }
760
761
762
763 static iam_ptr_t iam_find_ptr(struct iam_path *path, struct iam_frame *frame)
764 {
765         return dx_get_block(path, iam_find_position(path, frame));
766 }
767
768 void iam_insert_key(struct iam_path *path, struct iam_frame *frame,
769                     const struct iam_ikey *key, iam_ptr_t ptr)
770 {
771         struct iam_entry *entries = frame->entries;
772         struct iam_entry *new = iam_entry_shift(path, frame->at, +1);
773         int count = dx_get_count(entries);
774
775         /*
776          * Unfortunately we cannot assert this, as this function is sometimes
777          * called by VFS under i_sem and without pdirops lock.
778          */
779         assert_corr(1 || iam_frame_is_locked(path, frame));
780         assert_corr(count < dx_get_limit(entries));
781         assert_corr(frame->at < iam_entry_shift(path, entries, count));
782         assert_inv(dx_node_check(path, frame));
783
784         memmove(iam_entry_shift(path, new, 1), new,
785                 (char *)iam_entry_shift(path, entries, count) - (char *)new);
786         dx_set_ikey(path, new, key);
787         dx_set_block(path, new, ptr);
788         dx_set_count(entries, count + 1);
789         assert_inv(dx_node_check(path, frame));
790 }
791
792 void iam_insert_key_lock(struct iam_path *path, struct iam_frame *frame,
793                          const struct iam_ikey *key, iam_ptr_t ptr)
794 {
795         iam_lock_bh(frame->bh);
796         iam_insert_key(path, frame, key, ptr);
797         iam_unlock_bh(frame->bh);
798 }
799 /*
800  * returns 0 if path was unchanged, -EAGAIN otherwise.
801  */
802 static int iam_check_path(struct iam_path *path, struct iam_frame *frame)
803 {
804         int equal;
805
806         iam_lock_bh(frame->bh);
807         equal = iam_check_fast(path, frame) == 0 ||
808                 frame->leaf == iam_find_ptr(path, frame);
809         DX_DEVAL(iam_lock_stats.dls_bh_again += !equal);
810         iam_unlock_bh(frame->bh);
811
812         return equal ? 0 : -EAGAIN;
813 }
814
815 static int iam_lookup_try(struct iam_path *path)
816 {
817         u32 ptr;
818         int err = 0;
819         int i;
820
821         struct iam_descr *param;
822         struct iam_frame *frame;
823         struct iam_container *c;
824
825         param = iam_path_descr(path);
826         c = path->ip_container;
827
828         ptr = param->id_ops->id_root_ptr(c);
829         for (frame = path->ip_frames, i = 0; i <= path->ip_indirect;
830              ++frame, ++i) {
831                 err = param->id_ops->id_node_read(c, (iam_ptr_t)ptr, NULL,
832                                                   &frame->bh);
833                 do_corr(schedule());
834
835                 iam_lock_bh(frame->bh);
836                 /*
837                  * node must be initialized under bh lock because concurrent
838                  * creation procedure may change it and iam_lookup_try() will
839                  * see obsolete tree height. -bzzz
840                  */
841                 if (err != 0)
842                         break;
843
844                 if (LDISKFS_INVARIANT_ON) {
845                         err = param->id_ops->id_node_check(path, frame);
846                         if (err != 0)
847                                 break;
848                 }
849
850                 err = param->id_ops->id_node_load(path, frame);
851                 if (err != 0)
852                         break;
853
854                 assert_inv(dx_node_check(path, frame));
855                 /*
856                  * splitting may change the root index block and move the hash
857                  * we're looking for into another index block, so we have to
858                  * check this situation and repeat from the beginning if the
859                  * path got changed -bzzz
860                  */
861                 if (i > 0) {
862                         err = iam_check_path(path, frame - 1);
863                         if (err != 0)
864                                 break;
865                 }
866
867                 frame->at = iam_find_position(path, frame);
868                 frame->curidx = ptr;
869                 frame->leaf = ptr = dx_get_block(path, frame->at);
870
871                 iam_unlock_bh(frame->bh);
872                 do_corr(schedule());
873         }
874         if (err != 0)
875                 iam_unlock_bh(frame->bh);
876         path->ip_frame = --frame;
877         return err;
878 }
879
880 static int __iam_path_lookup(struct iam_path *path)
881 {
882         int err;
883         int i;
884
885         for (i = 0; i < DX_MAX_TREE_HEIGHT; ++ i)
886                 assert(path->ip_frames[i].bh == NULL);
887
888         do {
889                 err = iam_lookup_try(path);
890                 do_corr(schedule());
891                 if (err != 0)
892                         iam_path_fini(path);
893         } while (err == -EAGAIN);
894
895         return err;
896 }
897
898 /*
899  * returns 0 if path was unchanged, -EAGAIN otherwise.
900  */
901 static int iam_check_full_path(struct iam_path *path, int search)
902 {
903         struct iam_frame *bottom;
904         struct iam_frame *scan;
905         int i;
906         int result;
907
908         do_corr(schedule());
909
910         for (bottom = path->ip_frames, i = 0;
911              i < DX_MAX_TREE_HEIGHT && bottom->bh != NULL; ++bottom, ++i) {
912                 ; /* find last filled in frame */
913         }
914
915         /*
916          * Lock frames, bottom to top.
917          */
918         for (scan = bottom - 1; scan >= path->ip_frames; --scan)
919                 iam_lock_bh(scan->bh);
920         /*
921          * Check them top to bottom.
922          */
923         result = 0;
924         for (scan = path->ip_frames; scan < bottom; ++scan) {
925                 struct iam_entry *pos;
926
927                 if (search) {
928                         if (iam_check_fast(path, scan) == 0)
929                                 continue;
930
931                         pos = iam_find_position(path, scan);
932                         if (scan->leaf != dx_get_block(path, pos)) {
933                                 result = -EAGAIN;
934                                 break;
935                         }
936                         scan->at = pos;
937                 } else {
938                         pos = iam_entry_shift(path, scan->entries,
939                                               dx_get_count(scan->entries) - 1);
940                         if (scan->at > pos ||
941                             scan->leaf != dx_get_block(path, scan->at)) {
942                                 result = -EAGAIN;
943                                 break;
944                         }
945                 }
946         }
947
948         /*
949          * Unlock top to bottom.
950          */
951         for (scan = path->ip_frames; scan < bottom; ++scan)
952                 iam_unlock_bh(scan->bh);
953         DX_DEVAL(iam_lock_stats.dls_bh_full_again += !!result);
954         do_corr(schedule());
955
956         return result;
957 }
958
959
960 /*
961  * Performs path lookup and returns with found leaf (if any) locked by htree
962  * lock.
963  */
964 static int iam_lookup_lock(struct iam_path *path,
965                            struct dynlock_handle **dl, enum dynlock_type lt)
966 {
967         int result;
968
969         while ((result = __iam_path_lookup(path)) == 0) {
970                 do_corr(schedule());
971                 *dl = iam_lock_htree(path->ip_container, path->ip_frame->leaf,
972                                      lt);
973                 if (*dl == NULL) {
974                         iam_path_fini(path);
975                         result = -ENOMEM;
976                         break;
977                 }
978                 do_corr(schedule());
979                 /*
980                  * while we were locking it, the leaf we just found may have
981                  * been split, so we need to check for this -bzzz
982                  */
983                 if (iam_check_full_path(path, 1) == 0)
984                         break;
985                 iam_unlock_htree(path->ip_container, *dl);
986                 *dl = NULL;
987                 iam_path_fini(path);
988         }
989         return result;
990 }
991 /*
992  * Performs tree top-to-bottom traversal starting from root, and loads leaf
993  * node.
994  */
995 static int iam_path_lookup(struct iam_path *path, int index)
996 {
997         struct iam_leaf  *leaf;
998         int result;
999
1000         leaf = &path->ip_leaf;
1001         result = iam_lookup_lock(path, &leaf->il_lock, DLT_WRITE);
1002         assert_inv(iam_path_check(path));
1003         do_corr(schedule());
1004         if (result == 0) {
1005                 result = iam_leaf_load(path);
1006                 if (result == 0) {
1007                         do_corr(schedule());
1008                         if (index)
1009                                 result = iam_leaf_ops(leaf)->
1010                                         ilookup(leaf, path->ip_ikey_target);
1011                         else
1012                                 result = iam_leaf_ops(leaf)->
1013                                         lookup(leaf, path->ip_key_target);
1014                         do_corr(schedule());
1015                 }
1016                 if (result < 0)
1017                         iam_leaf_unlock(leaf);
1018         }
1019         return result;
1020 }
1021
1022 /*
1023  * Common part of iam_it_{i,}get().
1024  */
1025 static int __iam_it_get(struct iam_iterator *it, int index)
1026 {
1027         int result;
1028
1029         assert_corr(it_state(it) == IAM_IT_DETACHED);
1030
1031         result = iam_path_lookup(&it->ii_path, index);
1032         if (result >= 0) {
1033                 int collision;
1034
1035                 collision = result & IAM_LOOKUP_LAST;
1036                 switch (result & ~IAM_LOOKUP_LAST) {
1037                 case IAM_LOOKUP_EXACT:
1038                         result = +1;
1039                         it->ii_state = IAM_IT_ATTACHED;
1040                         break;
1041                 case IAM_LOOKUP_OK:
1042                         result = 0;
1043                         it->ii_state = IAM_IT_ATTACHED;
1044                         break;
1045                 case IAM_LOOKUP_BEFORE:
1046                 case IAM_LOOKUP_EMPTY:
1047                         result = 0;
1048                         it->ii_state = IAM_IT_SKEWED;
1049                         break;
1050                 default:
1051                         assert(0);
1052                 }
1053                 result |= collision;
1054         }
1055         /*
1056          * See iam_it_get_exact() for explanation.
1057          */
1058         assert_corr(result != -ENOENT);
1059         return result;
1060 }
1061
1062 /*
1063  * A record with the correct hash but not the same key was found: iterate
1064  * through the hash collision chain, looking for the correct record.
1065  */
1066 static int iam_it_collision(struct iam_iterator *it)
1067 {
1068         int result;
1069
1070         assert(ergo(it_at_rec(it), !it_keyeq(it, it->ii_path.ip_key_target)));
1071
1072         while ((result = iam_it_next(it)) == 0) {
1073                 do_corr(schedule());
1074                 if (it_ikeycmp(it, it->ii_path.ip_ikey_target) != 0)
1075                         return -ENOENT;
1076                 if (it_keyeq(it, it->ii_path.ip_key_target))
1077                         return 0;
1078         }
1079         return result;
1080 }
1081
1082 /*
1083  * Attach iterator. After successful completion, @it points to record with
1084  * least key not larger than @k.
1085  *
1086  * Return value: 0: positioned on existing record,
1087  *             +ve: exact position found,
1088  *             -ve: error.
1089  *
1090  * precondition:  it_state(it) == IAM_IT_DETACHED
1091  * postcondition: ergo(result == 0 && it_state(it) == IAM_IT_ATTACHED,
1092  *                     it_keycmp(it, k) <= 0)
1093  */
1094 int iam_it_get(struct iam_iterator *it, const struct iam_key *k)
1095 {
1096         int result;
1097
1098         assert_corr(it_state(it) == IAM_IT_DETACHED);
1099
1100         it->ii_path.ip_ikey_target = NULL;
1101         it->ii_path.ip_key_target  = k;
1102
1103         result = __iam_it_get(it, 0);
1104
1105         if (result == IAM_LOOKUP_LAST) {
1106                 result = iam_it_collision(it);
1107                 if (result != 0) {
1108                         iam_it_put(it);
1109                         iam_it_fini(it);
1110                         result = __iam_it_get(it, 0);
1111                 } else
1112                         result = +1;
1113         }
1114         if (result > 0)
1115                 result &= ~IAM_LOOKUP_LAST;
1116
1117         assert_corr(ergo(result > 0, it_keycmp(it, k) == 0));
1118         assert_corr(ergo(result == 0 && it_state(it) == IAM_IT_ATTACHED,
1119                     it_keycmp(it, k) <= 0));
1120         return result;
1121 }
1122
1123 /*
1124  * Attach iterator by index key.
1125  */
1126 static int iam_it_iget(struct iam_iterator *it, const struct iam_ikey *k)
1127 {
1128         assert_corr(it_state(it) == IAM_IT_DETACHED);
1129
1130         it->ii_path.ip_ikey_target = k;
1131         return __iam_it_get(it, 1) & ~IAM_LOOKUP_LAST;
1132 }
1133
1134 /*
1135  * Attach iterator, and assure it points to the record (not skewed).
1136  *
1137  * Return value: 0: positioned on existing record,
1138  *             +ve: exact position found,
1139  *             -ve: error.
1140  *
1141  * precondition:  it_state(it) == IAM_IT_DETACHED &&
1142  *                !(it->ii_flags&IAM_IT_WRITE)
1143  * postcondition: ergo(result == 0, it_state(it) == IAM_IT_ATTACHED)
1144  */
1145 int iam_it_get_at(struct iam_iterator *it, const struct iam_key *k)
1146 {
1147         int result;
1148
1149         assert_corr(it_state(it) == IAM_IT_DETACHED &&
1150                     !(it->ii_flags&IAM_IT_WRITE));
1151         result = iam_it_get(it, k);
1152         if (result == 0) {
1153                 if (it_state(it) != IAM_IT_ATTACHED) {
1154                         assert_corr(it_state(it) == IAM_IT_SKEWED);
1155                         result = iam_it_next(it);
1156                 }
1157         }
1158         assert_corr(ergo(result >= 0, it_state(it) == IAM_IT_ATTACHED));
1159         return result;
1160 }
1161
1162 /*
1163  * Duplicates iterator.
1164  *
1165  * postcondition: it_state(dst) == it_state(src) &&
1166  *                iam_it_container(dst) == iam_it_container(src) &&
1167  *                dst->ii_flags == src->ii_flags &&
1168  *                ergo(it_state(src) == IAM_IT_ATTACHED,
1169  *                     iam_it_rec_get(dst) == iam_it_rec_get(src) &&
1170  *                     iam_it_key_get(dst) == iam_it_key_get(src))
1171  */
1172 void iam_it_dup(struct iam_iterator *dst, const struct iam_iterator *src)
1173 {
1174         dst->ii_flags = src->ii_flags;
1175         dst->ii_state = src->ii_state;
1176         /* XXX not yet. iam_path_dup(&dst->ii_path, &src->ii_path); */
1177         /*
1178          * XXX: duplicate lock.
1179          */
1180         assert_corr(it_state(dst) == it_state(src));
1181         assert_corr(iam_it_container(dst) == iam_it_container(src));
1182         assert_corr(dst->ii_flags == src->ii_flags);
1183         assert_corr(ergo(it_state(src) == IAM_IT_ATTACHED,
1184                     iam_it_rec_get(dst) == iam_it_rec_get(src) &&
1185                     iam_it_key_get(dst) == iam_it_key_get(src)));
1186 }
1187
1188 /*
1189  * Detach iterator. Does nothing if already in detached state.
1190  *
1191  * postcondition: it_state(it) == IAM_IT_DETACHED
1192  */
1193 void iam_it_put(struct iam_iterator *it)
1194 {
1195         if (it->ii_state != IAM_IT_DETACHED) {
1196                 it->ii_state = IAM_IT_DETACHED;
1197                 iam_leaf_fini(&it->ii_path.ip_leaf);
1198         }
1199 }
1200
1201 static struct iam_ikey *iam_it_ikey_get(const struct iam_iterator *it,
1202                                         struct iam_ikey *ikey);
1203
1204
1205 /*
1206  * This function increments the frame pointer to search the next leaf
1207  * block, and reads in the necessary intervening nodes if the search
1208  * should be necessary.  Whether or not the search is necessary is
1209  * controlled by the hash parameter.  If the hash value is even, then
1210  * the search is only continued if the next block starts with that
1211  * hash value.  This is used if we are searching for a specific file.
1212  *
1213  * If the hash value is HASH_NB_ALWAYS, then always go to the next block.
1214  *
1215  * This function returns 1 if the caller should continue to search,
1216  * or 0 if it should not.  If there is an error reading one of the
1217  * index blocks, it will return a negative error code.
1218  *
1219  * If start_hash is non-null, it will be filled in with the starting
1220  * hash of the next page.
1221  */
1222 static int iam_htree_advance(struct inode *dir, __u32 hash,
1223                               struct iam_path *path, __u32 *start_hash,
1224                               int compat)
1225 {
1226         struct iam_frame *p;
1227         struct buffer_head *bh;
1228         int err, num_frames = 0;
1229         __u32 bhash;
1230
1231         p = path->ip_frame;
1232         /*
1233          * Find the next leaf page by incrementing the frame pointer.
1234          * If we run out of entries in the interior node, loop around and
1235          * increment pointer in the parent node.  When we break out of
1236          * this loop, num_frames indicates the number of interior
1237          * nodes that need to be read.
1238          */
1239         while (1) {
1240                 do_corr(schedule());
1241                 iam_lock_bh(p->bh);
1242                 if (p->at_shifted)
1243                         p->at_shifted = 0;
1244                 else
1245                         p->at = iam_entry_shift(path, p->at, +1);
1246                 if (p->at < iam_entry_shift(path, p->entries,
1247                                             dx_get_count(p->entries))) {
1248                         p->leaf = dx_get_block(path, p->at);
1249                         iam_unlock_bh(p->bh);
1250                         break;
1251                 }
1252                 iam_unlock_bh(p->bh);
1253                 if (p == path->ip_frames)
1254                         return 0;
1255                 num_frames++;
1256                 --p;
1257         }
1258
1259         if (compat) {
1260                 /*
1261                  * Htree hash magic.
1262                  */
1263
1264                 /*
1265                  * If the hash is 1, then continue only if the next page has a
1266                  * continuation hash of any value.  This is used for readdir
1267                  * handling.  Otherwise, check to see if the hash matches the
1268                  * desired continuation hash.  If it doesn't, return since
1269                  * there's no point to read in the successive index pages.
1270                  */
1271                 dx_get_ikey(path, p->at, (struct iam_ikey *)&bhash);
1272                 if (start_hash)
1273                         *start_hash = bhash;
1274                 if ((hash & 1) == 0) {
1275                         if ((bhash & ~1) != hash)
1276                                 return 0;
1277                 }
1278         }
1279         /*
1280          * If the hash is HASH_NB_ALWAYS, we always go to the next
1281          * block so no check is necessary
1282          */
1283         while (num_frames--) {
1284                 iam_ptr_t idx;
1285
1286                 do_corr(schedule());
1287                 iam_lock_bh(p->bh);
1288                 idx = p->leaf = dx_get_block(path, p->at);
1289                 iam_unlock_bh(p->bh);
1290                 err = iam_path_descr(path)->id_ops->
1291                         id_node_read(path->ip_container, idx, NULL, &bh);
1292                 if (err != 0)
1293                         return err; /* Failure */
1294                 ++p;
1295                 brelse(p->bh);
1296                 assert_corr(p->bh != bh);
1297                 p->bh = bh;
1298                 p->entries = dx_node_get_entries(path, p);
1299                 p->at = iam_entry_shift(path, p->entries, !compat);
1300                 assert_corr(p->curidx != idx);
1301                 p->curidx = idx;
1302                 iam_lock_bh(p->bh);
1303                 assert_corr(p->leaf != dx_get_block(path, p->at));
1304                 p->leaf = dx_get_block(path, p->at);
1305                 iam_unlock_bh(p->bh);
1306                 assert_inv(dx_node_check(path, p));
1307         }
1308         return 1;
1309 }
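/*
 * Worked example for the continuation check above (illustrative values): when
 * searching for the even hash 0x2a3c, the walk proceeds to the next block only
 * when that block's starting hash satisfies (bhash & ~1) == 0x2a3c, i.e. bhash
 * is 0x2a3c or 0x2a3d; for an odd hash value (the readdir case described in
 * the comment above) the check is skipped and the walk always continues.
 */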
1310
1311 static inline int iam_index_advance(struct iam_path *path)
1312 {
1313         return iam_htree_advance(iam_path_obj(path), 0, path, NULL, 0);
1314 }
1315
1316 static void iam_unlock_array(struct iam_container *ic,
1317                              struct dynlock_handle **lh)
1318 {
1319         int i;
1320
1321         for (i = 0; i < DX_MAX_TREE_HEIGHT; ++i, ++lh) {
1322                 if (*lh != NULL) {
1323                         iam_unlock_htree(ic, *lh);
1324                         *lh = NULL;
1325                 }
1326         }
1327 }
1328 /*
1329  * Advance index part of @path to point to the next leaf. Returns 1 on
1330  * success, 0 when the end of the container was reached. Leaf node is locked.
1331  */
1332 int iam_index_next(struct iam_container *c, struct iam_path *path)
1333 {
1334         iam_ptr_t cursor;
1335         struct dynlock_handle *lh[DX_MAX_TREE_HEIGHT] = { NULL, };
1336         int result;
1337
1338         /*
1339          * Locking for iam_index_next()... is to be described.
1340          */
1341
1342         cursor = path->ip_frame->leaf;
1343
1344         while (1) {
1345                 result = iam_index_lock(path, lh);
1346                 do_corr(schedule());
1347                 if (result < 0)
1348                         break;
1349
1350                 result = iam_check_full_path(path, 0);
1351                 if (result == 0 && cursor == path->ip_frame->leaf) {
1352                         result = iam_index_advance(path);
1353
1354                         assert_corr(result == 0 ||
1355                                     cursor != path->ip_frame->leaf);
1356                         break;
1357                 }
1358                 do {
1359                         iam_unlock_array(c, lh);
1360
1361                         iam_path_release(path);
1362                         do_corr(schedule());
1363
1364                         result = __iam_path_lookup(path);
1365                         if (result < 0)
1366                                 break;
1367
1368                         while (path->ip_frame->leaf != cursor) {
1369                                 do_corr(schedule());
1370
1371                                 result = iam_index_lock(path, lh);
1372                                 do_corr(schedule());
1373                                 if (result < 0)
1374                                         break;
1375
1376                                 result = iam_check_full_path(path, 0);
1377                                 if (result != 0)
1378                                         break;
1379
1380                                 result = iam_index_advance(path);
1381                                 if (result == 0) {
1382                                         CERROR("cannot find cursor : %u\n",
1383                                                 cursor);
1384                                         result = -EIO;
1385                                 }
1386                                 if (result < 0)
1387                                         break;
1388                                 result = iam_check_full_path(path, 0);
1389                                 if (result != 0)
1390                                         break;
1391                                 iam_unlock_array(c, lh);
1392                         }
1393                 } while (result == -EAGAIN);
1394                 if (result < 0)
1395                         break;
1396         }
1397         iam_unlock_array(c, lh);
1398         return result;
1399 }
1400
1401 /*
1402  * Move iterator one record right.
1403  *
1404  * Return value: 0: success,
1405  *              +1: end of container reached
1406  *             -ve: error
1407  *
1408  * precondition:  (it_state(it) == IAM_IT_ATTACHED ||
1409  *                 it_state(it) == IAM_IT_SKEWED) && it->ii_flags&IAM_IT_MOVE
1410  * postcondition: ergo(result == 0, it_state(it) == IAM_IT_ATTACHED) &&
1411  *                ergo(result >  0, it_state(it) == IAM_IT_DETACHED)
1412  */
1413 int iam_it_next(struct iam_iterator *it)
1414 {
1415         int result;
1416         struct iam_path *path;
1417         struct iam_leaf *leaf;
1418
1419         do_corr(struct iam_ikey *ik_orig);
1420
1421         /* assert_corr(it->ii_flags&IAM_IT_MOVE); */
1422         assert_corr(it_state(it) == IAM_IT_ATTACHED ||
1423                     it_state(it) == IAM_IT_SKEWED);
1424
1425         path = &it->ii_path;
1426         leaf = &path->ip_leaf;
1427
1428         assert_corr(iam_leaf_is_locked(leaf));
1429
1430         result = 0;
1431         do_corr(ik_orig = it_at_rec(it) ?
1432                 iam_it_ikey_get(it, iam_path_ikey(path, 2)) : NULL);
1433         if (it_before(it)) {
1434                 assert_corr(!iam_leaf_at_end(leaf));
1435                 it->ii_state = IAM_IT_ATTACHED;
1436         } else {
1437                 if (!iam_leaf_at_end(leaf))
1438                         /* advance within leaf node */
1439                         iam_leaf_next(leaf);
1440                 /*
1441                  * multiple iterations may be necessary due to empty leaves.
1442                  */
1443                 while (result == 0 && iam_leaf_at_end(leaf)) {
1444                         do_corr(schedule());
1445                         /* advance index portion of the path */
1446                         result = iam_index_next(iam_it_container(it), path);
1447                         assert_corr(iam_leaf_is_locked(leaf));
1448                         if (result == 1) {
1449                                 struct dynlock_handle *lh;
1450                                 lh = iam_lock_htree(iam_it_container(it),
1451                                                     path->ip_frame->leaf,
1452                                                     DLT_WRITE);
1453                                 if (lh != NULL) {
1454                                         iam_leaf_fini(leaf);
1455                                         leaf->il_lock = lh;
1456                                         result = iam_leaf_load(path);
1457                                         if (result == 0)
1458                                                 iam_leaf_start(leaf);
1459                                 } else
1460                                         result = -ENOMEM;
1461                         } else if (result == 0)
1462                                 /* end of container reached */
1463                                 result = +1;
1464                         if (result != 0)
1465                                 iam_it_put(it);
1466                 }
1467                 if (result == 0)
1468                         it->ii_state = IAM_IT_ATTACHED;
1469         }
1470         assert_corr(ergo(result == 0, it_state(it) == IAM_IT_ATTACHED));
1471         assert_corr(ergo(result >  0, it_state(it) == IAM_IT_DETACHED));
1472         assert_corr(ergo(result == 0 && ik_orig != NULL,
1473                     it_ikeycmp(it, ik_orig) >= 0));
1474         return result;
1475 }
1476
1477 /*
1478  * Return pointer to the record under iterator.
1479  *
1480  * precondition:  it_state(it) == IAM_IT_ATTACHED && it_at_rec(it)
1481  * postcondition: it_state(it) == IAM_IT_ATTACHED
1482  */
1483 struct iam_rec *iam_it_rec_get(const struct iam_iterator *it)
1484 {
1485         assert_corr(it_state(it) == IAM_IT_ATTACHED);
1486         assert_corr(it_at_rec(it));
1487         return iam_leaf_rec(&it->ii_path.ip_leaf);
1488 }
1489
1490 static void iam_it_reccpy(struct iam_iterator *it, const struct iam_rec *r)
1491 {
1492         struct iam_leaf *folio;
1493
1494         folio = &it->ii_path.ip_leaf;
1495         iam_leaf_ops(folio)->rec_set(folio, r);
1496 }
1497
1498 /*
1499  * Replace contents of record under iterator.
1500  *
1501  * precondition:  it_state(it) == IAM_IT_ATTACHED &&
1502  *                it->ii_flags&IAM_IT_WRITE
1503  * postcondition: it_state(it) == IAM_IT_ATTACHED &&
1504  *                ergo(result == 0, !memcmp(iam_it_rec_get(it), r, ...))
1505  */
1506 int iam_it_rec_set(handle_t *h,
1507                    struct iam_iterator *it, const struct iam_rec *r)
1508 {
1509         int result;
1510         struct iam_path *path;
1511         struct buffer_head *bh;
1512
1513         assert_corr(it_state(it) == IAM_IT_ATTACHED &&
1514                     it->ii_flags&IAM_IT_WRITE);
1515         assert_corr(it_at_rec(it));
1516
1517         path = &it->ii_path;
1518         bh = path->ip_leaf.il_bh;
1519         result = iam_txn_add(h, path, bh);
1520         if (result == 0) {
1521                 iam_it_reccpy(it, r);
1522                 result = iam_txn_dirty(h, path, bh);
1523         }
1524         return result;
1525 }
1526
1527 /*
1528  * Return pointer to the index key under iterator.
1529  *
1530  * precondition:  it_state(it) == IAM_IT_ATTACHED ||
1531  *                it_state(it) == IAM_IT_SKEWED
1532  */
1533 static struct iam_ikey *iam_it_ikey_get(const struct iam_iterator *it,
1534                                         struct iam_ikey *ikey)
1535 {
1536         assert_corr(it_state(it) == IAM_IT_ATTACHED ||
1537                     it_state(it) == IAM_IT_SKEWED);
1538         assert_corr(it_at_rec(it));
1539         return iam_leaf_ikey(&it->ii_path.ip_leaf, ikey);
1540 }
1541
1542 /*
1543  * Return pointer to the key under iterator.
1544  *
1545  * precondition:  it_state(it) == IAM_IT_ATTACHED ||
1546  *                it_state(it) == IAM_IT_SKEWED
1547  */
1548 struct iam_key *iam_it_key_get(const struct iam_iterator *it)
1549 {
1550         assert_corr(it_state(it) == IAM_IT_ATTACHED ||
1551                     it_state(it) == IAM_IT_SKEWED);
1552         assert_corr(it_at_rec(it));
1553         return iam_leaf_key(&it->ii_path.ip_leaf);
1554 }
1555
1556 /*
1557  * Return size of key under iterator (in bytes)
1558  *
1559  * precondition:  it_state(it) == IAM_IT_ATTACHED ||
1560  *                it_state(it) == IAM_IT_SKEWED
1561  */
1562 int iam_it_key_size(const struct iam_iterator *it)
1563 {
1564         assert_corr(it_state(it) == IAM_IT_ATTACHED ||
1565                     it_state(it) == IAM_IT_SKEWED);
1566         assert_corr(it_at_rec(it));
1567         return iam_leaf_key_size(&it->ii_path.ip_leaf);
1568 }
1569
1570 static struct buffer_head *
1571 iam_new_node(handle_t *h, struct iam_container *c, iam_ptr_t *b, int *e)
1572 {
1573         struct inode *inode = c->ic_object;
1574         struct buffer_head *bh = NULL;
1575         struct iam_idle_head *head;
1576         struct buffer_head *idle;
1577         __u32 *idle_blocks;
1578         __u16 count;
1579
1580         if (c->ic_idle_bh == NULL)
1581                 goto newblock;
1582
1583         mutex_lock(&c->ic_idle_mutex);
1584         if (unlikely(c->ic_idle_bh == NULL)) {
1585                 mutex_unlock(&c->ic_idle_mutex);
1586                 goto newblock;
1587         }
1588
1589         head = (struct iam_idle_head *)(c->ic_idle_bh->b_data);
1590         count = le16_to_cpu(head->iih_count);
1591         if (count > 0) {
1592                 *e = ldiskfs_journal_get_write_access(h, c->ic_idle_bh);
1593                 if (*e != 0)
1594                         goto fail;
1595
1596                 --count;
1597                 *b = le32_to_cpu(head->iih_blks[count]);
1598                 head->iih_count = cpu_to_le16(count);
1599                 *e = ldiskfs_handle_dirty_metadata(h, inode, c->ic_idle_bh);
1600                 if (*e != 0)
1601                         goto fail;
1602
1603                 mutex_unlock(&c->ic_idle_mutex);
1604                 bh = __ldiskfs_bread(NULL, inode, *b, 0);
1605                 if (IS_ERR_OR_NULL(bh)) {
1606                         if (IS_ERR(bh))
1607                                 *e = PTR_ERR(bh);
1608                         else
1609                                 *e = -EIO;
1610                         return NULL;
1611                 }
1612                 goto got;
1613         }
1614
1615         /* The block that contains the iam_idle_head is itself an idle
1616          * block, and can be used as the new node. */
1617         idle_blocks = (__u32 *)(c->ic_root_bh->b_data +
1618                                 c->ic_descr->id_root_gap +
1619                                 sizeof(struct dx_countlimit));
1620         *e = ldiskfs_journal_get_write_access(h, c->ic_root_bh);
1621         if (*e != 0)
1622                 goto fail;
1623
1624         *b = le32_to_cpu(*idle_blocks);
1625         iam_lock_bh(c->ic_root_bh);
1626         *idle_blocks = head->iih_next;
1627         iam_unlock_bh(c->ic_root_bh);
1628         *e = ldiskfs_handle_dirty_metadata(h, inode, c->ic_root_bh);
1629         if (*e != 0) {
1630                 iam_lock_bh(c->ic_root_bh);
1631                 *idle_blocks = cpu_to_le32(*b);
1632                 iam_unlock_bh(c->ic_root_bh);
1633                 goto fail;
1634         }
1635
1636         bh = c->ic_idle_bh;
1637         idle = iam_load_idle_blocks(c, le32_to_cpu(*idle_blocks));
1638         if (idle != NULL && IS_ERR(idle)) {
1639                 *e = PTR_ERR(idle);
1640                 c->ic_idle_bh = NULL;
1641                 brelse(bh);
1642                 goto fail;
1643         }
1644
1645         c->ic_idle_bh = idle;
1646         mutex_unlock(&c->ic_idle_mutex);
1647
1648 got:
1649         /* get write access for the found buffer head */
1650         *e = ldiskfs_journal_get_write_access(h, bh);
1651         if (*e != 0) {
1652                 brelse(bh);
1653                 bh = NULL;
1654                 ldiskfs_std_error(inode->i_sb, *e);
1655         } else {
1656                 /* Clear the reused node so it looks like a freshly allocated one. */
1657                 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
1658                 set_buffer_uptodate(bh);
1659         }
1660         return bh;
1661
1662 newblock:
1663         bh = osd_ldiskfs_append(h, inode, b);
1664         if (IS_ERR(bh)) {
1665                 *e = PTR_ERR(bh);
1666                 bh = NULL;
1667         }
1668
1669         return bh;
1670
1671 fail:
1672         mutex_unlock(&c->ic_idle_mutex);
1673         ldiskfs_std_error(inode->i_sb, *e);
1674         return NULL;
1675 }
1676
1677 /*
1678  * Insertion of a new record. Interaction with jbd during the non-trivial case
1679  * (when a split happens) is as follows:
1680  *
1681  *  - the new leaf node is added to the transaction by iam_new_node();
1682  *
1683  *  - the old leaf node is added to the transaction by iam_add_rec();
1684  *
1685  *  - the leaf where the insertion point ends up is marked dirty by iam_add_rec();
1686  *
1687  *  - the leaf without the insertion point is marked dirty (as @new_leaf) by
1688  *  iam_new_leaf();
1689  *
1690  *  - split index nodes are added to the transaction and marked dirty by
1691  *  split_index_node().
1692  *
1693  *  - the "safe" index node, which is not split but receives the new pointer,
1694  *  is added to the transaction and marked dirty by split_index_node().
1695  *
1696  *  - the index node where the pointer to the new leaf is inserted is added to
1697  *  the transaction by split_index_node() and marked dirty by iam_add_rec().
1698  *
1699  *  - the inode is marked dirty by iam_add_rec().
1700  *
1701  */
1702
1703 static int iam_new_leaf(handle_t *handle, struct iam_leaf *leaf)
1704 {
1705         int err;
1706         iam_ptr_t blknr;
1707         struct buffer_head *new_leaf;
1708         struct buffer_head *old_leaf;
1709         struct iam_container *c;
1710         struct inode *obj;
1711         struct iam_path *path;
1712
1713         c = iam_leaf_container(leaf);
1714         path = leaf->il_path;
1715
1716         obj = c->ic_object;
1717         new_leaf = iam_new_node(handle, c, &blknr, &err);
1718         do_corr(schedule());
1719         if (new_leaf != NULL) {
1720                 struct dynlock_handle *lh;
1721
1722                 lh = iam_lock_htree(c, blknr, DLT_WRITE);
1723                 do_corr(schedule());
1724                 if (lh != NULL) {
1725                         iam_leaf_ops(leaf)->init_new(c, new_leaf);
1726                         do_corr(schedule());
1727                         old_leaf = leaf->il_bh;
1728                         iam_leaf_split(leaf, &new_leaf, blknr);
1729                         if (old_leaf != leaf->il_bh) {
1730                                 /*
1731                                  * Switched to the new leaf.
1732                                  */
1733                                 iam_leaf_unlock(leaf);
1734                                 leaf->il_lock = lh;
1735                                 path->ip_frame->leaf = blknr;
1736                         } else
1737                                 iam_unlock_htree(path->ip_container, lh);
1738                         do_corr(schedule());
1739                         err = iam_txn_dirty(handle, path, new_leaf);
1740                         if (err == 0)
1741                                 err = ldiskfs_mark_inode_dirty(handle, obj);
1742                         do_corr(schedule());
1743                 } else
1744                         err = -ENOMEM;
1745                 brelse(new_leaf);
1746         }
1747         assert_inv(iam_path_check(iam_leaf_path(leaf)));
1748         return err;
1749 }
1750
1751 static inline void dx_set_limit(struct iam_entry *entries, unsigned value)
1752 {
1753         ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
1754 }
1755
1756 static int iam_shift_entries(struct iam_path *path,
1757                          struct iam_frame *frame, unsigned count,
1758                          struct iam_entry *entries, struct iam_entry *entries2,
1759                          u32 newblock)
1760 {
1761         unsigned count1;
1762         unsigned count2;
1763         int delta;
1764
1765         struct iam_frame *parent = frame - 1;
1766         struct iam_ikey *pivot = iam_path_ikey(path, 3);
1767
1768         delta = dx_index_is_compat(path) ? 0 : +1;
1769
1770         count1 = count/2 + delta;
1771         count2 = count - count1;
1772         dx_get_ikey(path, iam_entry_shift(path, entries, count1), pivot);
1773
1774         dxtrace(printk("Split index %d/%d\n", count1, count2));
1775
1776         memcpy((char *) iam_entry_shift(path, entries2, delta),
1777                (char *) iam_entry_shift(path, entries, count1),
1778                count2 * iam_entry_size(path));
1779
1780         dx_set_count(entries2, count2 + delta);
1781         dx_set_limit(entries2, dx_node_limit(path));
1782
1783         /*
1784          * NOTE: very subtle piece of code. A competing dx_probe() may find the
1785          * 2nd level index in the root index; then we insert a new index here and
1786          * set a new count in that 2nd level index, so dx_probe() may see the 2nd
1787          * level index without the hash it looks for. The solution is to re-check
1788          * the root index after we have locked the just-found 2nd level index -bzzz
1789          */
1790         iam_insert_key_lock(path, parent, pivot, newblock);
1791
1792         /*
1793          * Now the old and the new 2nd level index blocks both contain the moved
1794          * pointers, so dx_probe() may find a pointer in either of them. It's OK -bzzz
1795          */
1796         iam_lock_bh(frame->bh);
1797         dx_set_count(entries, count1);
1798         iam_unlock_bh(frame->bh);
1799
1800         /*
1801          * Now the old 2nd level index block points to the first half of the
1802          * leaves. It is important that dx_probe() checks the root index block
1803          * for changes under dx_lock_bh(frame->bh) -bzzz
1804          */
1805
1806         return count1;
1807 }
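
/*
 * Worked example (added for illustration, not in the original source): with
 * count = 10 entries and a non-compat index (delta = +1), count1 = 10/2 + 1 = 6
 * and count2 = 10 - 6 = 4.  Four entries are copied into the new block starting
 * at slot 1, the new block's count becomes count2 + delta = 5, and the old
 * block's count is reduced to 6 under iam_lock_bh(frame->bh).
 */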
1808
1809
1810 int split_index_node(handle_t *handle, struct iam_path *path,
1811                      struct dynlock_handle **lh)
1812 {
1813         struct iam_entry *entries;   /* old block contents */
1814         struct iam_entry *entries2;  /* new block contents */
1815         struct iam_frame *frame, *safe;
1816         struct buffer_head *bh_new[DX_MAX_TREE_HEIGHT] = {NULL};
1817         u32 newblock[DX_MAX_TREE_HEIGHT] = {0};
1818         struct dynlock_handle *lock[DX_MAX_TREE_HEIGHT] = {NULL,};
1819         struct dynlock_handle *new_lock[DX_MAX_TREE_HEIGHT] = {NULL,};
1820         struct inode *dir = iam_path_obj(path);
1821         struct iam_descr *descr;
1822         int nr_splet;
1823         int i, err;
1824
1825         descr = iam_path_descr(path);
1826         /*
1827          * Algorithm below depends on this.
1828          */
1829         assert_corr(dx_root_limit(path) < dx_node_limit(path));
1830
1831         frame = path->ip_frame;
1832         entries = frame->entries;
1833
1834         /*
1835          * Tall-tree handling: we might have to split multiple index blocks
1836          * all the way up to tree root. Tricky point here is error handling:
1837          * to avoid complicated undo/rollback we
1838          *
1839          *   - first allocate all necessary blocks
1840          *
1841          *   - insert pointers into them atomically.
1842          */
1843
1844         /*
1845          * Locking: leaf is already locked. htree-locks are acquired on all
1846          * index nodes that require split bottom-to-top, on the "safe" node,
1847          * and on all new nodes
1848          */
1849
1850         dxtrace(printk("using %u of %u node entries\n",
1851                        dx_get_count(entries), dx_get_limit(entries)));
1852
1853         /* What levels need split? */
1854         for (nr_splet = 0; frame >= path->ip_frames &&
1855              dx_get_count(frame->entries) == dx_get_limit(frame->entries);
1856              --frame, ++nr_splet) {
1857                 do_corr(schedule());
1858                 if (nr_splet == DX_MAX_TREE_HEIGHT) {
1859                         /*
1860                          * CWARN(dir->i_sb, __FUNCTION__,
1861                          * "Directory index full!\n");
1862                          */
1863                         err = -ENOSPC;
1864                         goto cleanup;
1865                 }
1866         }
1867
1868         safe = frame;
1869
1870         /*
1871          * Lock all nodes, bottom to top.
1872          */
1873         for (frame = path->ip_frame, i = nr_splet; i >= 0; --i, --frame) {
1874                 do_corr(schedule());
1875                 lock[i] = iam_lock_htree(path->ip_container, frame->curidx,
1876                                          DLT_WRITE);
1877                 if (lock[i] == NULL) {
1878                         err = -ENOMEM;
1879                         goto cleanup;
1880                 }
1881         }
1882
1883         /*
1884          * Check for concurrent index modification.
1885          */
1886         err = iam_check_full_path(path, 1);
1887         if (err)
1888                 goto cleanup;
1889         /*
1890          * And check that the same number of nodes is to be split.
1891          */
1892         for (i = 0, frame = path->ip_frame; frame >= path->ip_frames &&
1893              dx_get_count(frame->entries) == dx_get_limit(frame->entries);
1894              --frame, ++i) {
1895                 ;
1896         }
1897         if (i != nr_splet) {
1898                 err = -EAGAIN;
1899                 goto cleanup;
1900         }
1901
1902         /*
1903          * Go back down, allocating blocks, locking them, and adding them to
1904          * the transaction...
1905          */
1906         for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) {
1907                 bh_new[i] = iam_new_node(handle, path->ip_container,
1908                                          &newblock[i], &err);
1909                 do_corr(schedule());
1910                 if (!bh_new[i] ||
1911                     descr->id_ops->id_node_init(path->ip_container,
1912                                                 bh_new[i], 0) != 0)
1913                         goto cleanup;
1914
1915                 new_lock[i] = iam_lock_htree(path->ip_container, newblock[i],
1916                                              DLT_WRITE);
1917                 if (new_lock[i] == NULL) {
1918                         err = -ENOMEM;
1919                         goto cleanup;
1920                 }
1921                 do_corr(schedule());
1922                 BUFFER_TRACE(frame->bh, "get_write_access");
1923                 err = ldiskfs_journal_get_write_access(handle, frame->bh);
1924                 if (err)
1925                         goto journal_error;
1926         }
1927         /* Add "safe" node to transaction too */
1928         if (safe + 1 != path->ip_frames) {
1929                 do_corr(schedule());
1930                 err = ldiskfs_journal_get_write_access(handle, safe->bh);
1931                 if (err)
1932                         goto journal_error;
1933         }
1934
1935         /* Go through nodes once more, inserting pointers */
1936         for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) {
1937                 unsigned count;
1938                 int idx;
1939                 struct buffer_head *bh2;
1940                 struct buffer_head *bh;
1941
1942                 entries = frame->entries;
1943                 count = dx_get_count(entries);
1944                 idx = iam_entry_diff(path, frame->at, entries);
1945
1946                 bh2 = bh_new[i];
1947                 entries2 = dx_get_entries(path, bh2->b_data, 0);
1948
1949                 bh = frame->bh;
1950                 if (frame == path->ip_frames) {
1951                         /* splitting root node. Tricky point:
1952                          *
1953                          * In the "normal" B-tree we'd split root *and* add
1954                          * new root to the tree with pointers to the old root
1955                          * and its sibling (thus introducing two new nodes).
1956                          *
1957                          * In htree it's enough to add one node, because the
1958                          * capacity of the root node is smaller than that of a
1959                          * non-root one.
1960                          */
1961                         struct iam_frame *frames;
1962                         struct iam_entry *next;
1963
1964                         assert_corr(i == 0);
1965
1966                         do_corr(schedule());
1967
1968                         frames = path->ip_frames;
1969                         memcpy((char *) entries2, (char *) entries,
1970                                count * iam_entry_size(path));
1971                         dx_set_limit(entries2, dx_node_limit(path));
1972
1973                         /* Set up root */
1974                         iam_lock_bh(frame->bh);
1975                         next = descr->id_ops->id_root_inc(path->ip_container,
1976                                                           path, frame);
1977                         dx_set_block(path, next, newblock[0]);
1978                         iam_unlock_bh(frame->bh);
1979
1980                         do_corr(schedule());
1981                         /* Shift frames in the path */
1982                         memmove(frames + 2, frames + 1,
1983                                (sizeof path->ip_frames) - 2 * sizeof frames[0]);
1984                         /* Add new access path frame */
1985                         frames[1].at = iam_entry_shift(path, entries2, idx);
1986                         frames[1].entries = entries = entries2;
1987                         frames[1].bh = bh2;
1988                         assert_inv(dx_node_check(path, frame));
1989                         ++ path->ip_frame;
1990                         ++ frame;
1991                         assert_inv(dx_node_check(path, frame));
1992                         bh_new[0] = NULL; /* buffer head is "consumed" */
1993                         err = ldiskfs_handle_dirty_metadata(handle, NULL, bh2);
1994                         if (err)
1995                                 goto journal_error;
1996                         do_corr(schedule());
1997                 } else {
1998                         /* splitting non-root index node. */
1999                         struct iam_frame *parent = frame - 1;
2000
2001                         do_corr(schedule());
2002                         count = iam_shift_entries(path, frame, count,
2003                                                 entries, entries2, newblock[i]);
2004                         /* Which index block gets the new entry? */
2005                         if (idx >= count) {
2006                                 int d = dx_index_is_compat(path) ? 0 : +1;
2007
2008                                 frame->at = iam_entry_shift(path, entries2,
2009                                                             idx - count + d);
2010                                 frame->entries = entries = entries2;
2011                                 frame->curidx = newblock[i];
2012                                 swap(frame->bh, bh2);
2013                                 assert_corr(lock[i + 1] != NULL);
2014                                 assert_corr(new_lock[i] != NULL);
2015                                 swap(lock[i + 1], new_lock[i]);
2016                                 bh_new[i] = bh2;
2017                                 parent->at = iam_entry_shift(path,
2018                                                              parent->at, +1);
2019                         }
2020                         assert_inv(dx_node_check(path, frame));
2021                         assert_inv(dx_node_check(path, parent));
2022                         dxtrace(dx_show_index("node", frame->entries));
2023                         dxtrace(dx_show_index("node",
2024                                 ((struct dx_node *) bh2->b_data)->entries));
2025                         err = ldiskfs_handle_dirty_metadata(handle, NULL, bh2);
2026                         if (err)
2027                                 goto journal_error;
2028                         do_corr(schedule());
2029                         err = ldiskfs_handle_dirty_metadata(handle, NULL,
2030                                                             parent->bh);
2031                         if (err)
2032                                 goto journal_error;
2033                 }
2034                 do_corr(schedule());
2035                 err = ldiskfs_handle_dirty_metadata(handle, NULL, bh);
2036                 if (err)
2037                         goto journal_error;
2038         }
2039         /*
2040          * This function was called to make insertion of new leaf
2041          * possible. Check that it fulfilled its obligations.
2042          */
2043         assert_corr(dx_get_count(path->ip_frame->entries) <
2044                     dx_get_limit(path->ip_frame->entries));
2045         assert_corr(lock[nr_splet] != NULL);
2046         *lh = lock[nr_splet];
2047         lock[nr_splet] = NULL;
2048         if (nr_splet > 0) {
2049                 /*
2050                  * Log ->i_size modification.
2051                  */
2052                 err = ldiskfs_mark_inode_dirty(handle, dir);
2053                 if (err)
2054                         goto journal_error;
2055         }
2056         goto cleanup;
2057 journal_error:
2058         ldiskfs_std_error(dir->i_sb, err);
2059
2060 cleanup:
2061         iam_unlock_array(path->ip_container, lock);
2062         iam_unlock_array(path->ip_container, new_lock);
2063
2064         assert_corr(err || iam_frame_is_locked(path, path->ip_frame));
2065
2066         do_corr(schedule());
2067         for (i = 0; i < ARRAY_SIZE(bh_new); ++i) {
2068                 if (bh_new[i] != NULL)
2069                         brelse(bh_new[i]);
2070         }
2071         return err;
2072 }
2073
2074 static int iam_add_rec(handle_t *handle, struct iam_iterator *it,
2075                        struct iam_path *path,
2076                        const struct iam_key *k, const struct iam_rec *r)
2077 {
2078         int err;
2079         struct iam_leaf *leaf;
2080
2081         leaf = &path->ip_leaf;
2082         assert_inv(iam_path_check(path));
2083         err = iam_txn_add(handle, path, leaf->il_bh);
2084         if (err == 0) {
2085                 do_corr(schedule());
2086                 if (!iam_leaf_can_add(leaf, k, r)) {
2087                         struct dynlock_handle *lh = NULL;
2088
2089                         do {
2090                                 assert_corr(lh == NULL);
2091                                 do_corr(schedule());
2092                                 err = split_index_node(handle, path, &lh);
2093                                 if (err == -EAGAIN) {
2094                                         assert_corr(lh == NULL);
2095
2096                                         iam_path_fini(path);
2097                                         it->ii_state = IAM_IT_DETACHED;
2098
2099                                         do_corr(schedule());
2100                                         err = iam_it_get_exact(it, k);
2101                                         if (err == -ENOENT)
2102                                                 err = +1; /* repeat split */
2103                                         else if (err == 0)
2104                                                 err = -EEXIST;
2105                                 }
2106                         } while (err > 0);
2107                         assert_inv(iam_path_check(path));
2108                         if (err == 0) {
2109                                 assert_corr(lh != NULL);
2110                                 do_corr(schedule());
2111                                 err = iam_new_leaf(handle, leaf);
2112                                 if (err == 0)
2113                                         err = iam_txn_dirty(handle, path,
2114                                                             path->ip_frame->bh);
2115                         }
2116                         iam_unlock_htree(path->ip_container, lh);
2117                         do_corr(schedule());
2118                 }
2119                 if (err == 0) {
2120                         iam_leaf_rec_add(leaf, k, r);
2121                         err = iam_txn_dirty(handle, path, leaf->il_bh);
2122                 }
2123         }
2124         assert_inv(iam_path_check(path));
2125         return err;
2126 }
2127
2128 /*
2129  * Insert new record with key @k and contents from @r, shifting records to the
2130  * right. On success, iterator is positioned on the newly inserted record.
2131  *
2132  * precondition: it->ii_flags&IAM_IT_WRITE &&
2133  *               (it_state(it) == IAM_IT_ATTACHED ||
2134  *                it_state(it) == IAM_IT_SKEWED) &&
2135  *               ergo(it_state(it) == IAM_IT_ATTACHED,
2136  *                    it_keycmp(it, k) <= 0) &&
2137  *               ergo(it_before(it), it_keycmp(it, k) > 0));
2138  * postcondition: ergo(result == 0,
2139  *                     it_state(it) == IAM_IT_ATTACHED &&
2140  *                     it_keycmp(it, k) == 0 &&
2141  *                     !memcmp(iam_it_rec_get(it), r, ...))
2142  */
2143 int iam_it_rec_insert(handle_t *h, struct iam_iterator *it,
2144                       const struct iam_key *k, const struct iam_rec *r)
2145 {
2146         int result;
2147         struct iam_path *path;
2148
2149         path = &it->ii_path;
2150
2151         assert_corr(it->ii_flags&IAM_IT_WRITE);
2152         assert_corr(it_state(it) == IAM_IT_ATTACHED ||
2153                     it_state(it) == IAM_IT_SKEWED);
2154         assert_corr(ergo(it_state(it) == IAM_IT_ATTACHED,
2155                     it_keycmp(it, k) <= 0));
2156         assert_corr(ergo(it_before(it), it_keycmp(it, k) > 0));
2157         result = iam_add_rec(h, it, path, k, r);
2158         if (result == 0)
2159                 it->ii_state = IAM_IT_ATTACHED;
2160         assert_corr(ergo(result == 0,
2161                          it_state(it) == IAM_IT_ATTACHED &&
2162                          it_keycmp(it, k) == 0));
2163         return result;
2164 }
2165
2166 static inline int iam_idle_blocks_limit(struct inode *inode)
2167 {
2168         return (inode->i_sb->s_blocksize - sizeof(struct iam_idle_head)) >> 2;
2169 }
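
/*
 * Worked example (added for illustration, not in the original source): with a
 * 4096-byte block and an 8-byte struct iam_idle_head (two __u16 fields plus a
 * __u32 next pointer), the limit is (4096 - 8) >> 2 = 1022 idle block slots,
 * since each slot in iih_blks[] holds a 4-byte block number.  The exact figure
 * depends on the actual block size and structure layout.
 */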
2170
2171 /*
2172  * If the leaf cannot be recycled, we simply lose one block for reuse.  That
2173  * is not a serious issue, because the result is almost the same as not recycling.
2174  */
2175 static iam_ptr_t iam_index_shrink(handle_t *h, struct iam_path *p,
2176                                   struct iam_leaf *l, struct buffer_head **bh)
2177 {
2178         struct iam_container *c = p->ip_container;
2179         struct inode *inode = c->ic_object;
2180         struct iam_frame *frame = p->ip_frame;
2181         struct iam_entry *entries;
2182         struct iam_entry *pos;
2183         struct dynlock_handle *lh;
2184         int count;
2185         int rc;
2186
2187         if (c->ic_idle_failed)
2188                 return 0;
2189
2190         if (unlikely(frame == NULL))
2191                 return 0;
2192
2193         if (!iam_leaf_empty(l))
2194                 return 0;
2195
2196         lh = iam_lock_htree(c, frame->curidx, DLT_WRITE);
2197         if (lh == NULL) {
2198                 CWARN("%s: No memory to recycle idle blocks\n",
2199                       osd_ino2name(inode));
2200                 return 0;
2201         }
2202
2203         rc = iam_txn_add(h, p, frame->bh);
2204         if (rc != 0) {
2205                 iam_unlock_htree(c, lh);
2206                 return 0;
2207         }
2208
2209         iam_lock_bh(frame->bh);
2210         entries = frame->entries;
2211         count = dx_get_count(entries);
2212         /*
2213          * Do NOT shrink away the last entry in the index node; it can be
2214          * reused directly by the next new node.
2215          */
2216         if (count == 2) {
2217                 iam_unlock_bh(frame->bh);
2218                 iam_unlock_htree(c, lh);
2219                 return 0;
2220         }
2221
2222         pos = iam_find_position(p, frame);
2223         /*
2224          * Some new leaf nodes may have been added, or empty leaf nodes shrunk
2225          * away, while this delete operation was in progress.
2226          *
2227          * If the empty leaf is no longer under the current index node because
2228          * the index node has been split, just skip the empty leaf; this is rare.
2229          */
2230         if (unlikely(frame->leaf != dx_get_block(p, pos))) {
2231                 iam_unlock_bh(frame->bh);
2232                 iam_unlock_htree(c, lh);
2233                 return 0;
2234         }
2235
2236         frame->at = pos;
2237         if (frame->at < iam_entry_shift(p, entries, count - 1)) {
2238                 struct iam_entry *n = iam_entry_shift(p, frame->at, 1);
2239
2240                 memmove(frame->at, n,
2241                         (char *)iam_entry_shift(p, entries, count) - (char *)n);
2242                 frame->at_shifted = 1;
2243         }
2244         dx_set_count(entries, count - 1);
2245         iam_unlock_bh(frame->bh);
2246         rc = iam_txn_dirty(h, p, frame->bh);
2247         iam_unlock_htree(c, lh);
2248         if (rc != 0)
2249                 return 0;
2250
2251         get_bh(l->il_bh);
2252         *bh = l->il_bh;
2253         return frame->leaf;
2254 }
2255
2256 static int
2257 iam_install_idle_blocks(handle_t *h, struct iam_path *p, struct buffer_head *bh,
2258                         __u32 *idle_blocks, iam_ptr_t blk)
2259 {
2260         struct iam_container *c = p->ip_container;
2261         struct buffer_head *old = c->ic_idle_bh;
2262         struct iam_idle_head *head;
2263         int rc;
2264
2265         head = (struct iam_idle_head *)(bh->b_data);
2266         head->iih_magic = cpu_to_le16(IAM_IDLE_HEADER_MAGIC);
2267         head->iih_count = 0;
2268         head->iih_next = *idle_blocks;
2269         /* Write access on this bh has already been obtained by the caller. */
2270         rc = iam_txn_dirty(h, p, bh);
2271         if (rc != 0)
2272                 return rc;
2273
2274         rc = iam_txn_add(h, p, c->ic_root_bh);
2275         if (rc != 0)
2276                 return rc;
2277
2278         iam_lock_bh(c->ic_root_bh);
2279         *idle_blocks = cpu_to_le32(blk);
2280         iam_unlock_bh(c->ic_root_bh);
2281         rc = iam_txn_dirty(h, p, c->ic_root_bh);
2282         if (rc == 0) {
2283                 /* Do NOT release the old bh before the new one is assigned. */
2284                 get_bh(bh);
2285                 c->ic_idle_bh = bh;
2286                 brelse(old);
2287         } else {
2288                 iam_lock_bh(c->ic_root_bh);
2289                 *idle_blocks = head->iih_next;
2290                 iam_unlock_bh(c->ic_root_bh);
2291         }
2292         return rc;
2293 }
2294
2295 /*
2296  * If the leaf cannot be recycled, we simply lose one block for reuse.  That
2297  * is not a serious issue, because the result is almost the same as not recycling.
2298  */
2299 static void iam_recycle_leaf(handle_t *h, struct iam_path *p,
2300                              struct buffer_head *bh, iam_ptr_t blk)
2301 {
2302         struct iam_container *c = p->ip_container;
2303         struct inode *inode = c->ic_object;
2304         struct iam_idle_head *head;
2305         __u32 *idle_blocks;
2306         int count;
2307         int rc;
2308
2309         mutex_lock(&c->ic_idle_mutex);
2310         if (unlikely(c->ic_idle_failed)) {
2311                 rc = -EFAULT;
2312                 goto unlock;
2313         }
2314
2315         idle_blocks = (__u32 *)(c->ic_root_bh->b_data +
2316                                 c->ic_descr->id_root_gap +
2317                                 sizeof(struct dx_countlimit));
2318         /* It is the first idle block. */
2319         if (c->ic_idle_bh == NULL) {
2320                 rc = iam_install_idle_blocks(h, p, bh, idle_blocks, blk);
2321                 goto unlock;
2322         }
2323
2324         head = (struct iam_idle_head *)(c->ic_idle_bh->b_data);
2325         count = le16_to_cpu(head->iih_count);
2326         /* Current ic_idle_bh is full, to be replaced by the leaf. */
2327         if (count == iam_idle_blocks_limit(inode)) {
2328                 rc = iam_install_idle_blocks(h, p, bh, idle_blocks, blk);
2329                 goto unlock;
2330         }
2331
2332         /* Just add to ic_idle_bh. */
2333         rc = iam_txn_add(h, p, c->ic_idle_bh);
2334         if (rc != 0)
2335                 goto unlock;
2336
2337         head->iih_blks[count] = cpu_to_le32(blk);
2338         head->iih_count = cpu_to_le16(count + 1);
2339         rc = iam_txn_dirty(h, p, c->ic_idle_bh);
2340
2341 unlock:
2342         mutex_unlock(&c->ic_idle_mutex);
2343         if (rc != 0)
2344                 CWARN("%s: idle blocks failed, will lose the blk %u\n",
2345                       osd_ino2name(inode), blk);
2346 }
2347
2348 /*
2349  * Delete record under iterator.
2350  *
2351  * precondition:  it_state(it) == IAM_IT_ATTACHED &&
2352  *                it->ii_flags&IAM_IT_WRITE &&
2353  *                it_at_rec(it)
2354  * postcondition: it_state(it) == IAM_IT_ATTACHED ||
2355  *                it_state(it) == IAM_IT_DETACHED
2356  */
2357 int iam_it_rec_delete(handle_t *h, struct iam_iterator *it)
2358 {
2359         int result;
2360         struct iam_leaf *leaf;
2361         struct iam_path *path;
2362
2363         assert_corr(it_state(it) == IAM_IT_ATTACHED &&
2364                     it->ii_flags&IAM_IT_WRITE);
2365         assert_corr(it_at_rec(it));
2366
2367         path = &it->ii_path;
2368         leaf = &path->ip_leaf;
2369
2370         assert_inv(iam_path_check(path));
2371
2372         result = iam_txn_add(h, path, leaf->il_bh);
2373         /*
2374          * no compaction for now.
2375          */
2376         if (result == 0) {
2377                 iam_rec_del(leaf, it->ii_flags&IAM_IT_MOVE);
2378                 result = iam_txn_dirty(h, path, leaf->il_bh);
2379                 if (result == 0 && iam_leaf_at_end(leaf)) {
2380                         struct buffer_head *bh = NULL;
2381                         iam_ptr_t blk;
2382
2383                         blk = iam_index_shrink(h, path, leaf, &bh);
2384                         if (it->ii_flags & IAM_IT_MOVE) {
2385                                 result = iam_it_next(it);
2386                                 if (result > 0)
2387                                         result = 0;
2388                         }
2389
2390                         if (bh != NULL) {
2391                                 iam_recycle_leaf(h, path, bh, blk);
2392                                 brelse(bh);
2393                         }
2394                 }
2395         }
2396         assert_inv(iam_path_check(path));
2397         assert_corr(it_state(it) == IAM_IT_ATTACHED ||
2398                     it_state(it) == IAM_IT_DETACHED);
2399         return result;
2400 }
2401
2402 /*
2403  * Convert iterator to cookie.
2404  *
2405  * precondition:  it_state(it) == IAM_IT_ATTACHED &&
2406  *                iam_it_container(it)->ic_descr->id_ikey_size <= sizeof(iam_pos_t)
2407  * postcondition: it_state(it) == IAM_IT_ATTACHED
2408  */
2409 iam_pos_t iam_it_store(const struct iam_iterator *it)
2410 {
2411         iam_pos_t result;
2412
2413         assert_corr(it_state(it) == IAM_IT_ATTACHED);
2414         assert_corr(it_at_rec(it));
2415         assert_corr(iam_it_container(it)->ic_descr->id_ikey_size <=
2416                     sizeof result);
2417
2418         result = 0;
2419         return *(iam_pos_t *)iam_it_ikey_get(it, (void *)&result);
2420 }
2421
2422 /*
2423  * Restore iterator from cookie.
2424  *
2425  * precondition:  it_state(it) == IAM_IT_DETACHED && it->ii_flags&IAM_IT_MOVE &&
2426  *                iam_it_container(it)->ic_descr->id_ikey_size <= sizeof(iam_pos_t)
2427  * postcondition: ergo(result == 0, it_state(it) == IAM_IT_ATTACHED &&
2428  *                                  iam_it_store(it) == pos)
2429  */
2430 int iam_it_load(struct iam_iterator *it, iam_pos_t pos)
2431 {
2432         assert_corr(it_state(it) == IAM_IT_DETACHED &&
2433                 it->ii_flags&IAM_IT_MOVE);
2434         assert_corr(iam_it_container(it)->ic_descr->id_ikey_size <= sizeof pos);
2435         return iam_it_iget(it, (struct iam_ikey *)&pos);
2436 }
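
/*
 * Illustrative sketch (not part of the original source; the helper name is
 * made up for this example): saving an iterator position as a cookie and
 * resuming from it later.  It assumes a container whose id_ikey_size fits
 * into iam_pos_t and that @c, @pd and @k were set up by the caller; error
 * handling is omitted.
 */
#if 0
static void iam_cookie_example(struct iam_container *c,
			       struct iam_path_descr *pd,
			       const struct iam_key *k)
{
	struct iam_iterator it;
	iam_pos_t pos = 0;

	/* Position the iterator on @k and remember the position as a cookie. */
	iam_it_init(&it, c, IAM_IT_MOVE, pd);
	if (iam_it_get_exact(&it, k) == 0) {
		pos = iam_it_store(&it);
		iam_it_put(&it);
	}
	iam_it_fini(&it);

	/* Later: restore a detached iterator from the saved cookie. */
	iam_it_init(&it, c, IAM_IT_MOVE, pd);
	if (iam_it_load(&it, pos) == 0) {
		/* ... read records, advance with iam_it_next(&it), ... */
		iam_it_put(&it);
	}
	iam_it_fini(&it);
}
#endif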
2437
2438 /***********************************************************************/
2439 /* invariants                                                          */
2440 /***********************************************************************/
2441
2442 static inline int ptr_inside(void *base, size_t size, void *ptr)
2443 {
2444         return (base <= ptr) && (ptr < base + size);
2445 }
2446
2447 static int iam_frame_invariant(struct iam_frame *f)
2448 {
2449         return
2450                 (f->bh != NULL &&
2451                 f->bh->b_data != NULL &&
2452                 ptr_inside(f->bh->b_data, f->bh->b_size, f->entries) &&
2453                 ptr_inside(f->bh->b_data, f->bh->b_size, f->at) &&
2454                 f->entries <= f->at);
2455 }
2456
2457 static int iam_leaf_invariant(struct iam_leaf *l)
2458 {
2459         return
2460                 l->il_bh != NULL &&
2461                 l->il_bh->b_data != NULL &&
2462                 ptr_inside(l->il_bh->b_data, l->il_bh->b_size, l->il_entries) &&
2463                 ptr_inside(l->il_bh->b_data, l->il_bh->b_size, l->il_at) &&
2464                 l->il_entries <= l->il_at;
2465 }
2466
2467 static int iam_path_invariant(struct iam_path *p)
2468 {
2469         int i;
2470
2471         if (p->ip_container == NULL ||
2472             p->ip_indirect < 0 || p->ip_indirect > DX_MAX_TREE_HEIGHT - 1 ||
2473             p->ip_frame != p->ip_frames + p->ip_indirect ||
2474             !iam_leaf_invariant(&p->ip_leaf))
2475                 return 0;
2476         for (i = 0; i < ARRAY_SIZE(p->ip_frames); ++i) {
2477                 if (i <= p->ip_indirect) {
2478                         if (!iam_frame_invariant(&p->ip_frames[i]))
2479                                 return 0;
2480                 }
2481         }
2482         return 1;
2483 }
2484
2485 int iam_it_invariant(struct iam_iterator *it)
2486 {
2487         return
2488                 (it->ii_state == IAM_IT_DETACHED ||
2489                 it->ii_state == IAM_IT_ATTACHED ||
2490                 it->ii_state == IAM_IT_SKEWED) &&
2491                 !(it->ii_flags & ~(IAM_IT_MOVE | IAM_IT_WRITE)) &&
2492                 ergo(it->ii_state == IAM_IT_ATTACHED ||
2493                 it->ii_state == IAM_IT_SKEWED,
2494                 iam_path_invariant(&it->ii_path) &&
2495                 equi(it_at_rec(it), it->ii_state == IAM_IT_SKEWED));
2496 }
2497
2498 /*
2499  * Search container @c for record with key @k. If record is found, its data
2500  * are moved into @r.
2501  *
2502  * Return values: 0: found, -ENOENT: not-found, -ve: error
2503  */
2504 int iam_lookup(struct iam_container *c, const struct iam_key *k,
2505                struct iam_rec *r, struct iam_path_descr *pd)
2506 {
2507         struct iam_iterator it;
2508         int result;
2509
2510         iam_it_init(&it, c, 0, pd);
2511
2512         result = iam_it_get_exact(&it, k);
2513         if (result == 0)
2514                 /*
2515                  * record with required key found, copy it into user buffer
2516                  */
2517                 iam_reccpy(&it.ii_path.ip_leaf, r);
2518         iam_it_put(&it);
2519         iam_it_fini(&it);
2520         return result;
2521 }
2522
2523 /*
2524  * Insert new record @r with key @k into container @c (within context of
2525  * transaction @h).
2526  *
2527  * Return values: 0: success, -ve: error, including -EEXIST when record with
2528  * given key is already present.
2529  *
2530  * postcondition: ergo(result == 0 || result == -EEXIST,
2531  *                     iam_lookup(c, k, r2) == 0)
2532  */
2533 int iam_insert(handle_t *h, struct iam_container *c, const struct iam_key *k,
2534                const struct iam_rec *r, struct iam_path_descr *pd)
2535 {
2536         struct iam_iterator it;
2537         int result;
2538
2539         iam_it_init(&it, c, IAM_IT_WRITE, pd);
2540
2541         result = iam_it_get_exact(&it, k);
2542         if (result == -ENOENT)
2543                 result = iam_it_rec_insert(h, &it, k, r);
2544         else if (result == 0)
2545                 result = -EEXIST;
2546         iam_it_put(&it);
2547         iam_it_fini(&it);
2548         return result;
2549 }
2550
2551 /*
2552  * Update the record with key @k in container @c (within the context of
2553  * transaction @h); the new record is given by @r.
2554  *
2555  * Return values: +1: skipped because the record already has the same value,
2556  * 0: success, -ve: error, including -ENOENT if no record with the given key exists.
2557  */
2558 int iam_update(handle_t *h, struct iam_container *c, const struct iam_key *k,
2559                const struct iam_rec *r, struct iam_path_descr *pd)
2560 {
2561         struct iam_iterator it;
2562         struct iam_leaf *folio;
2563         int result;
2564
2565         iam_it_init(&it, c, IAM_IT_WRITE, pd);
2566
2567         result = iam_it_get_exact(&it, k);
2568         if (result == 0) {
2569                 folio = &it.ii_path.ip_leaf;
2570                 result = iam_leaf_ops(folio)->rec_eq(folio, r);
2571                 if (result == 0)
2572                         iam_it_rec_set(h, &it, r);
2573                 else
2574                         result = 1;
2575         }
2576         iam_it_put(&it);
2577         iam_it_fini(&it);
2578         return result;
2579 }
2580
2581 /*
2582  * Delete existing record with key @k.
2583  *
2584  * Return values: 0: success, -ENOENT: not-found, -ve: other error.
2585  *
2586  * postcondition: ergo(result == 0 || result == -ENOENT,
2587  *                     iam_lookup(c, k, *) == -ENOENT)
2588  */
2589 int iam_delete(handle_t *h, struct iam_container *c, const struct iam_key *k,
2590                struct iam_path_descr *pd)
2591 {
2592         struct iam_iterator it;
2593         int result;
2594
2595         iam_it_init(&it, c, IAM_IT_WRITE, pd);
2596
2597         result = iam_it_get_exact(&it, k);
2598         if (result == 0)
2599                 iam_it_rec_delete(h, &it);
2600         iam_it_put(&it);
2601         iam_it_fini(&it);
2602         return result;
2603 }
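
/*
 * Illustrative sketch (not part of the original source; the helper name is
 * made up for this example): typical use of the top-level container
 * operations above.  The journal handle @h, container @c, path descriptor
 * @pd and key/record buffers are assumed to have been prepared elsewhere
 * (in Lustre this is done by the osd-ldiskfs index code); iam_update()
 * follows the same pattern with its own +1 "unchanged" return value.
 */
#if 0
static int iam_api_example(handle_t *h, struct iam_container *c,
			   struct iam_path_descr *pd,
			   const struct iam_key *k,
			   const struct iam_rec *r, struct iam_rec *out)
{
	int rc;

	rc = iam_insert(h, c, k, r, pd);	/* -EEXIST if already present */
	if (rc != 0 && rc != -EEXIST)
		return rc;

	rc = iam_lookup(c, k, out, pd);		/* copies the record into @out */
	if (rc != 0)
		return rc;

	return iam_delete(h, c, k, pd);		/* -ENOENT if the key is gone */
}
#endif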
2604
2605 int iam_root_limit(int rootgap, int blocksize, int size)
2606 {
2607         int limit;
2608         int nlimit;
2609
2610         limit = (blocksize - rootgap) / size;
2611         nlimit = blocksize / size;
2612         if (limit == nlimit)
2613                 limit--;
2614         return limit;
2615 }
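
/*
 * Worked example (added for illustration, not in the original source): with
 * blocksize = 4096, rootgap = 32 and an 8-byte entry, limit = (4096 - 32) / 8
 * = 508 and nlimit = 4096 / 8 = 512, so 508 is returned unchanged.  Only when
 * the gap is smaller than one entry (so limit == nlimit, e.g. rootgap = 0) is
 * one slot subtracted, so the root limit stays strictly smaller than what a
 * gap-less node of the same block size could hold; split_index_node() relies
 * on the root limit being below the node limit.
 */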