1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * GPL HEADER START
5  *
6  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 only,
10  * as published by the Free Software Foundation.
11  *
12  * This program is distributed in the hope that it will be useful, but
13  * WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * General Public License version 2 for more details (a copy is included
16  * in the LICENSE file that accompanied this code).
17  *
18  * You should have received a copy of the GNU General Public License
19  * version 2 along with this program; If not, see [sun.com URL with a
20  * copy of GPLv2].
21  *
22  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23  * CA 95054 USA or visit www.sun.com if you need additional information or
24  * have any questions.
25  *
26  * GPL HEADER END
27  */
28 /*
29  * Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved
30  * Use is subject to license terms.
31  */
32 /*
33  * This file is part of Lustre, http://www.lustre.org/
34  * Lustre is a trademark of Sun Microsystems, Inc.
35  *
36  * iam.c
37  * Top-level entry points into iam module
38  *
39  * Author: Wang Di <wangdi@clusterfs.com>
40  * Author: Nikita Danilov <nikita@clusterfs.com>
41  */
42
43 /*
44  * iam: big theory statement.
45  *
46  * iam (Index Access Module) is a module providing an abstraction of a
47  * persistent transactional container on top of the generalized ldiskfs htree.
48  *
49  * iam supports:
50  *
51  *     - key, pointer, and record size specifiable per container.
52  *
53  *     - trees taller than 2 index levels.
54  *
55  *     - read/write to existing ldiskfs htree directories as iam containers.
56  *
57  * iam container is a tree, consisting of leaf nodes containing keys and
58  * records stored in this container, and index nodes, containing keys and
59  * pointers to leaf or index nodes.
60  *
61  * iam does not work with keys directly; instead it calls a user-supplied key
62  * comparison function (->dpo_keycmp()).
63  *
64  * Pointers are (currently) interpreted as logical offsets (measured in
65  * blocks) within the underlying flat file on top of which the iam tree lives.
66  *
67  * On-disk format:
68  *
69  * iam mostly tries to reuse existing htree formats.
70  *
71  * Format of index node:
72  *
73  * +-----+-------+-------+-------+------+-------+------------+
74  * |     | count |       |       |      |       |            |
75  * | gap |   /   | entry | entry | .... | entry | free space |
76  * |     | limit |       |       |      |       |            |
77  * +-----+-------+-------+-------+------+-------+------------+
78  *
79  *       gap           this part of node is never accessed by iam code. It
80  *                     exists for binary compatibility with ldiskfs htree (that,
81  *                     in turn, stores fake struct ext2_dirent for ext2
82  *                     compatibility), and to keep some unspecified per-node
83  *                     data. Gap can be different for root and non-root index
84  *                     nodes. Gap size can be specified for each container
85  *                     (gap of 0 is allowed).
86  *
87  *       count/limit   current number of entries in this node, and the maximal
88  *                     number of entries that can fit into node. count/limit
89  *                     has the same size as entry, and is itself counted in
90  *                     count.
91  *
92  *       entry         index entry: consists of a key immediately followed by
93  *                     a pointer to a child node. Size of a key and size of a
94  *                     pointer depends on container. Entry has neither
95  *                     alignment nor padding.
96  *
97  *       free space    portion of the node to which new entries are added
98  *
99  * Entries in index node are sorted by their key value.
100  *
101  * Format of a leaf node is not specified. Generic iam code accesses leaf
102  * nodes through ->id_leaf methods in struct iam_descr.
103  *
104  */
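
/*
 * To make the index-node layout above concrete, the disabled sketch below
 * shows how an entry could be located inside a node, assuming the layout
 * described here: a gap, a count/limit slot occupying the space of one
 * entry, then densely packed (key, pointer) entries with no alignment or
 * padding. The name example_entry_at() and its parameters are illustrative
 * only; the real code uses helpers such as iam_entry_shift() and
 * iam_entry_size().
 */
#if 0
static void *example_entry_at(void *node, unsigned gap,
                              unsigned key_size, unsigned ptr_size,
                              unsigned nr)
{
        unsigned entry_size = key_size + ptr_size;

        /* element 0 of the entry array is the count/limit slot; key+pointer
         * entries follow it and are themselves counted in "count" */
        return (unsigned char *)node + gap + nr * entry_size;
}
#endif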
105
106 #include <linux/module.h>
107 #include <linux/fs.h>
108 #include <linux/pagemap.h>
109 #include <linux/jbd.h>
110 #include <linux/time.h>
111 #include <linux/ldiskfs_fs.h>
112 #include <linux/ldiskfs_jbd.h>
113 #include <linux/fcntl.h>
114 #include <linux/stat.h>
115 #include <linux/string.h>
116 #include <linux/quotaops.h>
117 #include <linux/buffer_head.h>
118 #include <linux/smp_lock.h>
119 #include "osd_internal.h"
120
121 #include "xattr.h"
122 #include "iopen.h"
123 #include "acl.h"
124
125 /*
126  * List of all registered formats.
127  *
128  * No locking. Callers synchronize.
129  */
130 static LIST_HEAD(iam_formats);
131
132 void iam_format_register(struct iam_format *fmt)
133 {
134         list_add(&fmt->if_linkage, &iam_formats);
135 }
136 EXPORT_SYMBOL(iam_format_register);
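
/*
 * A hypothetical format would register itself as sketched below (compare
 * with iam_lvar_format_init() and iam_lfix_format_init() invoked from
 * iam_format_guess()). The iam_foo_* names are illustrative only.
 * ->if_guess() is expected to return 0 once it recognizes the container
 * and has set up its descriptor.
 */
#if 0
static int iam_foo_guess(struct iam_container *c)
{
        /* hypothetical: recognize the on-disk format, fill in c->ic_descr
         * and return 0; any non-zero value lets scanning continue */
        return -ENOENT;
}

static struct iam_format iam_foo_format = {
        .if_guess = iam_foo_guess
};

void iam_foo_format_init(void)
{
        iam_format_register(&iam_foo_format);
}
#endif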
137
138 /*
139  * Determine format of given container. This is done by scanning list of
140  * registered formats and calling ->if_guess() method of each in turn.
141  */
142 static int iam_format_guess(struct iam_container *c)
143 {
144         int result;
145         struct iam_format *fmt;
146
147         /*
148          * XXX temporary initialization hook.
149          */
150         {
151                 static int initialized = 0;
152
153                 if (!initialized) {
154                         iam_lvar_format_init();
155                         iam_lfix_format_init();
156                         initialized = 1;
157                 }
158         }
159
160         result = -ENOENT;
161         list_for_each_entry(fmt, &iam_formats, if_linkage) {
162                 result = fmt->if_guess(c);
163                 if (result == 0)
164                         break;
165         }
166         return result;
167 }
168
169 /*
170  * Initialize container @c.
171  */
172 int iam_container_init(struct iam_container *c,
173                        struct iam_descr *descr, struct inode *inode)
174 {
175         memset(c, 0, sizeof *c);
176         c->ic_descr  = descr;
177         c->ic_object = inode;
178         init_rwsem(&c->ic_sem);
179         return 0;
180 }
181 EXPORT_SYMBOL(iam_container_init);
182
183 /*
184  * Determine container format.
185  */
186 int iam_container_setup(struct iam_container *c)
187 {
188         return iam_format_guess(c);
189 }
190 EXPORT_SYMBOL(iam_container_setup);
191
192 /*
193  * Finalize container @c, release all resources.
194  */
195 void iam_container_fini(struct iam_container *c)
196 {
197 }
198 EXPORT_SYMBOL(iam_container_fini);
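
/*
 * A minimal sketch of the container life cycle as seen by a hypothetical
 * caller (error handling trimmed; @descr and @inode are assumed to be
 * supplied by that caller):
 */
#if 0
static int example_container_use(struct iam_descr *descr, struct inode *inode)
{
        struct iam_container c;
        int rc;

        rc = iam_container_init(&c, descr, inode);
        if (rc == 0) {
                rc = iam_container_setup(&c);  /* guess the on-disk format */
                /* ... attach iterators, look up / modify records ... */
                iam_container_fini(&c);
        }
        return rc;
}
#endif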
199
200 void iam_path_init(struct iam_path *path, struct iam_container *c,
201                    struct iam_path_descr *pd)
202 {
203         memset(path, 0, sizeof *path);
204         path->ip_container = c;
205         path->ip_frame = path->ip_frames;
206         path->ip_data = pd;
207         path->ip_leaf.il_path = path;
208 }
209
210 static void iam_leaf_fini(struct iam_leaf *leaf);
211
212 void iam_path_release(struct iam_path *path)
213 {
214         int i;
215
216         for (i = 0; i < ARRAY_SIZE(path->ip_frames); i++) {
217                 if (path->ip_frames[i].bh != NULL) {
218                         brelse(path->ip_frames[i].bh);
219                         path->ip_frames[i].bh = NULL;
220                 }
221         }
222 }
223
224 void iam_path_fini(struct iam_path *path)
225 {
226         iam_leaf_fini(&path->ip_leaf);
227         iam_path_release(path);
228 }
229
230
231 void iam_path_compat_init(struct iam_path_compat *path, struct inode *inode)
232 {
233         int i;
234
235         path->ipc_hinfo = &path->ipc_hinfo_area;
236         for (i = 0; i < ARRAY_SIZE(path->ipc_scratch); ++i)
237                 path->ipc_descr.ipd_key_scratch[i] =
238                         (struct iam_ikey *)&path->ipc_scratch[i];
239
240         iam_path_init(&path->ipc_path, &path->ipc_container, &path->ipc_descr);
241 }
242
243 void iam_path_compat_fini(struct iam_path_compat *path)
244 {
245         iam_path_fini(&path->ipc_path);
246 }
247
248 /*
249  * Helper function initializing iam_path_descr and its key scratch area.
250  */
251 struct iam_path_descr *iam_ipd_alloc(void *area, int keysize)
252 {
253         struct iam_path_descr *ipd;
254         void *karea;
255         int i;
256
257         ipd = area;
258         karea = ipd + 1;
259         for (i = 0; i < ARRAY_SIZE(ipd->ipd_key_scratch); ++i, karea += keysize)
260                 ipd->ipd_key_scratch[i] = karea;
261         return ipd;
262 }
263 EXPORT_SYMBOL(iam_ipd_alloc);
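
/*
 * The area passed to iam_ipd_alloc() must hold the descriptor itself plus
 * ARRAY_SIZE(ipd_key_scratch) keys of @keysize bytes each, as laid out by
 * the loop above. The allocation helper below is a hypothetical sketch of
 * such a caller, not part of the iam API:
 */
#if 0
static struct iam_path_descr *example_ipd_get(int keysize)
{
        struct iam_path_descr *ipd = NULL;
        void *area;

        /* ARRAY_SIZE() only applies sizeof, so using ipd here is safe */
        area = kmalloc(sizeof(*ipd) +
                       ARRAY_SIZE(ipd->ipd_key_scratch) * keysize, GFP_NOFS);
        if (area != NULL)
                ipd = iam_ipd_alloc(area, keysize);
        return ipd;
}
#endif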
264
265 void iam_ipd_free(struct iam_path_descr *ipd)
266 {
267 }
268 EXPORT_SYMBOL(iam_ipd_free);
269
270 int iam_node_read(struct iam_container *c, iam_ptr_t ptr,
271                   handle_t *h, struct buffer_head **bh)
272 {
273         int result = 0;
274
275         *bh = ldiskfs_bread(h, c->ic_object, (int)ptr, 0, &result);
276         if (*bh == NULL)
277                 result = -EIO;
278         return result;
279 }
280
281 /*
282  * Return pointer to current leaf record. Pointer is valid while corresponding
283  * leaf node is locked and pinned.
284  */
285 static struct iam_rec *iam_leaf_rec(const struct iam_leaf *leaf)
286 {
287         return iam_leaf_ops(leaf)->rec(leaf);
288 }
289
290 /*
291  * Return pointer to the current leaf key. This function returns pointer to
292  * the key stored in node.
293  *
294  * Caller should assume that returned pointer is only valid while leaf node is
295  * pinned and locked.
296  */
297 static struct iam_key *iam_leaf_key(const struct iam_leaf *leaf)
298 {
299         return iam_leaf_ops(leaf)->key(leaf);
300 }
301
302 static int iam_leaf_key_size(const struct iam_leaf *leaf)
303 {
304         return iam_leaf_ops(leaf)->key_size(leaf);
305 }
306
307 static struct iam_ikey *iam_leaf_ikey(const struct iam_leaf *leaf,
308                                       struct iam_ikey *key)
309 {
310         return iam_leaf_ops(leaf)->ikey(leaf, key);
311 }
312
313 static int iam_leaf_keycmp(const struct iam_leaf *leaf,
314                            const struct iam_key *key)
315 {
316         return iam_leaf_ops(leaf)->key_cmp(leaf, key);
317 }
318
319 static int iam_leaf_keyeq(const struct iam_leaf *leaf,
320                           const struct iam_key *key)
321 {
322         return iam_leaf_ops(leaf)->key_eq(leaf, key);
323 }
324
325 #if LDISKFS_INVARIANT_ON
326 static int iam_leaf_check(struct iam_leaf *leaf);
327 extern int dx_node_check(struct iam_path *p, struct iam_frame *f);
328
329 static int iam_path_check(struct iam_path *p)
330 {
331         int i;
332         int result;
333         struct iam_frame *f;
334         struct iam_descr *param;
335
336         result = 1;
337         param = iam_path_descr(p);
338         for (i = 0; result && i < ARRAY_SIZE(p->ip_frames); ++i) {
339                 f = &p->ip_frames[i];
340                 if (f->bh != NULL) {
341                         result = dx_node_check(p, f);
342                         if (result)
343                                 result = !param->id_ops->id_node_check(p, f);
344                 }
345         }
346         if (result && p->ip_leaf.il_bh != NULL)
347                 result = iam_leaf_check(&p->ip_leaf);
348         if (result == 0) {
349                 ldiskfs_std_error(iam_path_obj(p)->i_sb, result);
350         }
351         return result;
352 }
353 #endif
354
355 static int iam_leaf_load(struct iam_path *path)
356 {
357         iam_ptr_t block;
358         int err;
359         struct iam_container *c;
360         struct buffer_head   *bh;
361         struct iam_leaf      *leaf;
362         struct iam_descr     *descr;
363
364         c     = path->ip_container;
365         leaf  = &path->ip_leaf;
366         descr = iam_path_descr(path);
367         block = path->ip_frame->leaf;
368         if (block == 0) {
369                 /* XXX bug 11027 */
370                 printk(KERN_EMERG "wrong leaf: %lu %d [%p %p %p]\n",
371                        (long unsigned)path->ip_frame->leaf,
372                        dx_get_count(dx_node_get_entries(path, path->ip_frame)),
373                        path->ip_frames[0].bh, path->ip_frames[1].bh,
374                        path->ip_frames[2].bh);
375         }
376         err   = descr->id_ops->id_node_read(c, block, NULL, &bh);
377         if (err == 0) {
378                 leaf->il_bh = bh;
379                 leaf->il_curidx = block;
380                 err = iam_leaf_ops(leaf)->init(leaf);
381                 assert_inv(ergo(err == 0, iam_leaf_check(leaf)));
382         }
383         return err;
384 }
385
386 static void iam_unlock_htree(struct inode *dir, struct dynlock_handle *lh)
387 {
388         if (lh != NULL)
389                 dynlock_unlock(&LDISKFS_I(dir)->i_htree_lock, lh);
390 }
391
392
393 static void iam_leaf_unlock(struct iam_leaf *leaf)
394 {
395         if (leaf->il_lock != NULL) {
396                 iam_unlock_htree(iam_leaf_container(leaf)->ic_object,
397                                 leaf->il_lock);
398                 do_corr(schedule());
399                 leaf->il_lock = NULL;
400         }
401 }
402
403 static void iam_leaf_fini(struct iam_leaf *leaf)
404 {
405         if (leaf->il_path != NULL) {
406                 iam_leaf_unlock(leaf);
407                 assert_inv(ergo(leaf->il_bh != NULL, iam_leaf_check(leaf)));
408                 iam_leaf_ops(leaf)->fini(leaf);
409                 if (leaf->il_bh) {
410                         brelse(leaf->il_bh);
411                         leaf->il_bh = NULL;
412                         leaf->il_curidx = 0;
413                 }
414         }
415 }
416
417 static void iam_leaf_start(struct iam_leaf *folio)
418 {
419         iam_leaf_ops(folio)->start(folio);
420 }
421
422 void iam_leaf_next(struct iam_leaf *folio)
423 {
424         iam_leaf_ops(folio)->next(folio);
425 }
426
427 static void iam_leaf_rec_add(struct iam_leaf *leaf, const struct iam_key *key,
428                              const struct iam_rec *rec)
429 {
430         iam_leaf_ops(leaf)->rec_add(leaf, key, rec);
431 }
432
433 static void iam_rec_del(struct iam_leaf *leaf, int shift)
434 {
435         iam_leaf_ops(leaf)->rec_del(leaf, shift);
436 }
437
438 int iam_leaf_at_end(const struct iam_leaf *leaf)
439 {
440         return iam_leaf_ops(leaf)->at_end(leaf);
441 }
442
443 void iam_leaf_split(struct iam_leaf *l, struct buffer_head **bh, iam_ptr_t nr)
444 {
445         iam_leaf_ops(l)->split(l, bh, nr);
446 }
447
448 int iam_leaf_can_add(const struct iam_leaf *l,
449                      const struct iam_key *k, const struct iam_rec *r)
450 {
451         return iam_leaf_ops(l)->can_add(l, k, r);
452 }
453
454 #if LDISKFS_INVARIANT_ON
455 static int iam_leaf_check(struct iam_leaf *leaf)
456 {
457         return 1;
458 #if 0
459         struct iam_lentry    *orig;
460         struct iam_path      *path;
461         struct iam_container *bag;
462         struct iam_ikey       *k0;
463         struct iam_ikey       *k1;
464         int result;
465         int first;
466
467         orig = leaf->il_at;
468         path = iam_leaf_path(leaf);
469         bag  = iam_leaf_container(leaf);
470
471         result = iam_leaf_ops(leaf)->init(leaf);
472         if (result != 0)
473                 return result;
474
475         first = 1;
476         iam_leaf_start(leaf);
477         k0 = iam_path_ikey(path, 0);
478         k1 = iam_path_ikey(path, 1);
479         while (!iam_leaf_at_end(leaf)) {
480                 iam_ikeycpy(bag, k0, k1);
481                 iam_ikeycpy(bag, k1, iam_leaf_ikey(leaf, k1));
482                 if (!first && iam_ikeycmp(bag, k0, k1) > 0) {
483                         return 0;
484                 }
485                 first = 0;
486                 iam_leaf_next(leaf);
487         }
488         leaf->il_at = orig;
489         return 1;
490 #endif
491 }
492 #endif
493
494 static int iam_txn_dirty(handle_t *handle,
495                          struct iam_path *path, struct buffer_head *bh)
496 {
497         int result;
498
499         result = ldiskfs_journal_dirty_metadata(handle, bh);
500         if (result != 0)
501                 ldiskfs_std_error(iam_path_obj(path)->i_sb, result);
502         return result;
503 }
504
505 static int iam_txn_add(handle_t *handle,
506                        struct iam_path *path, struct buffer_head *bh)
507 {
508         int result;
509
510         result = ldiskfs_journal_get_write_access(handle, bh);
511         if (result != 0)
512                 ldiskfs_std_error(iam_path_obj(path)->i_sb, result);
513         return result;
514 }
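
/*
 * iam_txn_add()/iam_txn_dirty() wrap the usual jbd discipline for metadata
 * updates: declare write access to a buffer before changing it, and mark it
 * dirty afterwards (see iam_it_rec_set() below for a real user). A minimal
 * sketch of that pattern, with modify_buffer() as a hypothetical placeholder
 * for the actual update:
 */
#if 0
static int example_update(handle_t *h, struct iam_path *path,
                          struct buffer_head *bh)
{
        int rc;

        rc = iam_txn_add(h, path, bh);           /* get write access */
        if (rc == 0) {
                modify_buffer(bh);               /* hypothetical change */
                rc = iam_txn_dirty(h, path, bh); /* mark metadata dirty */
        }
        return rc;
}
#endif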
515
516 /***********************************************************************/
517 /* iterator interface                                                  */
518 /***********************************************************************/
519
520 static enum iam_it_state it_state(const struct iam_iterator *it)
521 {
522         return it->ii_state;
523 }
524
525 /*
526  * Helper function returning the container an iterator is attached to.
527  */
528 static struct iam_container *iam_it_container(const struct iam_iterator *it)
529 {
530         return it->ii_path.ip_container;
531 }
532
533 static inline int it_keycmp(const struct iam_iterator *it,
534                             const struct iam_key *k)
535 {
536         return iam_leaf_keycmp(&it->ii_path.ip_leaf, k);
537 }
538
539 static inline int it_keyeq(const struct iam_iterator *it,
540                            const struct iam_key *k)
541 {
542         return iam_leaf_keyeq(&it->ii_path.ip_leaf, k);
543 }
544
545 static int it_ikeycmp(const struct iam_iterator *it, const struct iam_ikey *ik)
546 {
547         return iam_ikeycmp(it->ii_path.ip_container,
548                            iam_leaf_ikey(&it->ii_path.ip_leaf,
549                                          iam_path_ikey(&it->ii_path, 0)), ik);
550 }
551
552 static inline int it_at_rec(const struct iam_iterator *it)
553 {
554         return !iam_leaf_at_end(&it->ii_path.ip_leaf);
555 }
556
557 static inline int it_before(const struct iam_iterator *it)
558 {
559         return it_state(it) == IAM_IT_SKEWED && it_at_rec(it);
560 }
561
562 /*
563  * Helper wrapper around iam_it_get(): returns 0 (success) only when record
564  * with exactly the same key as the one asked for is found.
565  */
566 static int iam_it_get_exact(struct iam_iterator *it, const struct iam_key *k)
567 {
568         int result;
569
570         result = iam_it_get(it, k);
571         if (result > 0)
572                 result = 0;
573         else if (result == 0)
574                 /*
575                  * Return -ENOENT if cursor is located above record with a key
576                  * different from one specified, or in the empty leaf.
577                  *
578                  * XXX returning -ENOENT only works if iam_it_get() never
579                  * returns -ENOENT as a legitimate error.
580                  */
581                 result = -ENOENT;
582         return result;
583 }
584
585 void iam_container_write_lock(struct iam_container *ic)
586 {
587         down_write(&ic->ic_sem);
588 }
589
590 void iam_container_write_unlock(struct iam_container *ic)
591 {
592         up_write(&ic->ic_sem);
593 }
594
595 void iam_container_read_lock(struct iam_container *ic)
596 {
597         down_read(&ic->ic_sem);
598 }
599
600 void iam_container_read_unlock(struct iam_container *ic)
601 {
602         up_read(&ic->ic_sem);
603 }
604
605 /*
606  * Initialize iterator to IAM_IT_DETACHED state.
607  *
608  * postcondition: it_state(it) == IAM_IT_DETACHED
609  */
610 int  iam_it_init(struct iam_iterator *it, struct iam_container *c, __u32 flags,
611                  struct iam_path_descr *pd)
612 {
613         memset(it, 0, sizeof *it);
614         it->ii_flags  = flags;
615         it->ii_state  = IAM_IT_DETACHED;
616         iam_path_init(&it->ii_path, c, pd);
617         return 0;
618 }
619 EXPORT_SYMBOL(iam_it_init);
620
621 /*
622  * Finalize iterator and release all resources.
623  *
624  * precondition: it_state(it) == IAM_IT_DETACHED
625  */
626 void iam_it_fini(struct iam_iterator *it)
627 {
628         assert_corr(it_state(it) == IAM_IT_DETACHED);
629         iam_path_fini(&it->ii_path);
630 }
631 EXPORT_SYMBOL(iam_it_fini);
632
633 /*
634  * These locking primitives are used to protect parts of the directory's
635  * htree. The protection unit is a block: a leaf or an index node.
636  */
637 struct dynlock_handle *iam_lock_htree(struct inode *dir, unsigned long value,
638                                      enum dynlock_type lt)
639 {
640         return dynlock_lock(&LDISKFS_I(dir)->i_htree_lock, value, lt, GFP_NOFS);
641 }
642
643
644
645 int iam_index_lock(struct iam_path *path, struct dynlock_handle **lh)
646 {
647         struct iam_frame *f;
648
649         for (f = path->ip_frame; f >= path->ip_frames; --f, ++lh) {
650                 do_corr(schedule());
651                 *lh = iam_lock_htree(iam_path_obj(path), f->curidx, DLT_READ);
652                 if (*lh == NULL)
653                         return -ENOMEM;
654         }
655         return 0;
656 }
657
658 /*
659  * Fast check for frame consistency.
660  */
661 static int iam_check_fast(struct iam_path *path, struct iam_frame *frame)
662 {
663         struct iam_container *bag;
664         struct iam_entry *next;
665         struct iam_entry *last;
666         struct iam_entry *entries;
667         struct iam_entry *at;
668
669         bag     = path->ip_container;
670         at      = frame->at;
671         entries = frame->entries;
672         last    = iam_entry_shift(path, entries, dx_get_count(entries) - 1);
673
674         if (unlikely(at > last))
675                 return -EAGAIN;
676
677         if (unlikely(dx_get_block(path, at) != frame->leaf))
678                 return -EAGAIN;
679
680         if (unlikely(iam_ikeycmp(bag, iam_ikey_at(path, at),
681                                  path->ip_ikey_target) > 0))
682                 return -EAGAIN;
683
684         next = iam_entry_shift(path, at, +1);
685         if (next <= last) {
686                 if (unlikely(iam_ikeycmp(bag, iam_ikey_at(path, next),
687                                          path->ip_ikey_target) <= 0))
688                         return -EAGAIN;
689         }
690         return 0;
691 }
692
693 int dx_index_is_compat(struct iam_path *path)
694 {
695         return iam_path_descr(path) == NULL;
696 }
697
698 /*
699  * iam_find_position
700  *
701  * Search for the position of the specified hash in the index node.
702  *
703  */
704
705 struct iam_entry *iam_find_position(struct iam_path *path,
706                                    struct iam_frame *frame)
707 {
708         int count;
709         struct iam_entry *p;
710         struct iam_entry *q;
711         struct iam_entry *m;
712
713         count = dx_get_count(frame->entries);
714         assert_corr(count && count <= dx_get_limit(frame->entries));
715         p = iam_entry_shift(path, frame->entries,
716                             dx_index_is_compat(path) ? 1 : 2);
717         q = iam_entry_shift(path, frame->entries, count - 1);
718         while (p <= q) {
719                 m = iam_entry_shift(path, p, iam_entry_diff(path, q, p) / 2);
720                 if (iam_ikeycmp(path->ip_container, iam_ikey_at(path, m),
721                                 path->ip_ikey_target) > 0)
722                         q = iam_entry_shift(path, m, -1);
723                 else
724                         p = iam_entry_shift(path, m, +1);
725         }
726         return iam_entry_shift(path, p, -1);
727 }
728
729
730
731 static iam_ptr_t iam_find_ptr(struct iam_path *path, struct iam_frame *frame)
732 {
733         return dx_get_block(path, iam_find_position(path, frame));
734 }
735
736 void iam_insert_key(struct iam_path *path, struct iam_frame *frame,
737                     const struct iam_ikey *key, iam_ptr_t ptr)
738 {
739         struct iam_entry *entries = frame->entries;
740         struct iam_entry *new = iam_entry_shift(path, frame->at, +1);
741         int count = dx_get_count(entries);
742
743         /*
744          * Unfortunately we cannot assert this, as this function is sometimes
745          * called by VFS under i_sem and without pdirops lock.
746          */
747         assert_corr(1 || iam_frame_is_locked(path, frame));
748         assert_corr(count < dx_get_limit(entries));
749         assert_corr(frame->at < iam_entry_shift(path, entries, count));
750         assert_inv(dx_node_check(path, frame));
751
752         memmove(iam_entry_shift(path, new, 1), new,
753                 (char *)iam_entry_shift(path, entries, count) - (char *)new);
754         dx_set_ikey(path, new, key);
755         dx_set_block(path, new, ptr);
756         dx_set_count(entries, count + 1);
757         assert_inv(dx_node_check(path, frame));
758 }
759
760 void iam_insert_key_lock(struct iam_path *path, struct iam_frame *frame,
761                          const struct iam_ikey *key, iam_ptr_t ptr)
762 {
763         iam_lock_bh(frame->bh);
764         iam_insert_key(path, frame, key, ptr);
765         iam_unlock_bh(frame->bh);
766 }
767 /*
768  * returns 0 if path was unchanged, -EAGAIN otherwise.
769  */
770 static int iam_check_path(struct iam_path *path, struct iam_frame *frame)
771 {
772         int equal;
773
774         iam_lock_bh(frame->bh);
775         equal = iam_check_fast(path, frame) == 0 ||
776                 frame->leaf == iam_find_ptr(path, frame);
777         DX_DEVAL(iam_lock_stats.dls_bh_again += !equal);
778         iam_unlock_bh(frame->bh);
779
780         return equal ? 0 : -EAGAIN;
781 }
782
783 static int iam_lookup_try(struct iam_path *path)
784 {
785         u32 ptr;
786         int err = 0;
787         int i;
788
789         struct iam_descr *param;
790         struct iam_frame *frame;
791         struct iam_container *c;
792
793         param = iam_path_descr(path);
794         c = path->ip_container;
795
796         ptr = param->id_ops->id_root_ptr(c);
797         for (frame = path->ip_frames, i = 0; i <= path->ip_indirect;
798              ++frame, ++i) {
799                 err = param->id_ops->id_node_read(c, (iam_ptr_t)ptr, NULL,
800                                                   &frame->bh);
801                 do_corr(schedule());
802
803                 iam_lock_bh(frame->bh);
804                 /*
805                  * node must be initialized under bh lock because concurrent
806                  * creation procedure may change it and iam_lookup_try() will
807                  * see obsolete tree height. -bzzz
808                  */
809                 if (err != 0)
810                         break;
811
812                 if (LDISKFS_INVARIANT_ON) {
813                         err = param->id_ops->id_node_check(path, frame);
814                         if (err != 0)
815                                 break;
816                 }
817
818                 err = param->id_ops->id_node_load(path, frame);
819                 if (err != 0)
820                         break;
821
822                 assert_inv(dx_node_check(path, frame));
823                 /*
824                  * splitting may change root index block and move hash we're
825          * looking for into another index block, so we have to check
826          * this situation and repeat from the beginning if the path got changed
827                  * -bzzz
828                  */
829                 if (i > 0) {
830                         err = iam_check_path(path, frame - 1);
831                         if (err != 0)
832                                 break;
833                 }
834
835                 frame->at = iam_find_position(path, frame);
836                 frame->curidx = ptr;
837                 frame->leaf = ptr = dx_get_block(path, frame->at);
838
839                 iam_unlock_bh(frame->bh);
840                 do_corr(schedule());
841         }
842         if (err != 0)
843                 iam_unlock_bh(frame->bh);
844         path->ip_frame = --frame;
845         return err;
846 }
847
848 static int __iam_path_lookup(struct iam_path *path)
849 {
850         int err;
851         int i;
852
853         for (i = 0; i < DX_MAX_TREE_HEIGHT; ++ i)
854                 assert(path->ip_frames[i].bh == NULL);
855
856         do {
857                 err = iam_lookup_try(path);
858                 do_corr(schedule());
859                 if (err != 0)
860                         iam_path_fini(path);
861         } while (err == -EAGAIN);
862
863         return err;
864 }
865
866 /*
867  * returns 0 if path was unchanged, -EAGAIN otherwise.
868  */
869 static int iam_check_full_path(struct iam_path *path, int search)
870 {
871         struct iam_frame *bottom;
872         struct iam_frame *scan;
873         int i;
874         int result;
875
876         do_corr(schedule());
877
878         for (bottom = path->ip_frames, i = 0;
879              i < DX_MAX_TREE_HEIGHT && bottom->bh != NULL; ++bottom, ++i) {
880                 ; /* find last filled in frame */
881         }
882
883         /*
884          * Lock frames, bottom to top.
885          */
886         for (scan = bottom - 1; scan >= path->ip_frames; --scan)
887                 iam_lock_bh(scan->bh);
888         /*
889          * Check them top to bottom.
890          */
891         result = 0;
892         for (scan = path->ip_frames; scan < bottom; ++scan) {
893                 struct iam_entry *pos;
894
895                 if (search) {
896                         if (iam_check_fast(path, scan) == 0)
897                                 continue;
898
899                         pos = iam_find_position(path, scan);
900                         if (scan->leaf != dx_get_block(path, pos)) {
901                                 result = -EAGAIN;
902                                 break;
903                         }
904                         scan->at = pos;
905                 } else {
906                         pos = iam_entry_shift(path, scan->entries,
907                                               dx_get_count(scan->entries) - 1);
908                         if (scan->at > pos ||
909                             scan->leaf != dx_get_block(path, scan->at)) {
910                                 result = -EAGAIN;
911                                 break;
912                         }
913                 }
914         }
915
916         /*
917          * Unlock top to bottom.
918          */
919         for (scan = path->ip_frames; scan < bottom; ++scan)
920                 iam_unlock_bh(scan->bh);
921         DX_DEVAL(iam_lock_stats.dls_bh_full_again += !!result);
922         do_corr(schedule());
923
924         return result;
925 }
926
927
928 /*
929  * Performs path lookup and returns with found leaf (if any) locked by htree
930  * lock.
931  */
932 int iam_lookup_lock(struct iam_path *path,
933                    struct dynlock_handle **dl, enum dynlock_type lt)
934 {
935         int result;
936         struct inode *dir;
937
938         dir = iam_path_obj(path);
939         while ((result = __iam_path_lookup(path)) == 0) {
940                 do_corr(schedule());
941                 *dl = iam_lock_htree(dir, path->ip_frame->leaf, lt);
942                 if (*dl == NULL) {
943                         iam_path_fini(path);
944                         result = -ENOMEM;
945                         break;
946                 }
947                 do_corr(schedule());
948                 /*
949          * while we were locking it, the leaf we just found may have been
950          * split, so we need to check this -bzzz
951                  */
952                 if (iam_check_full_path(path, 1) == 0)
953                         break;
954                 iam_unlock_htree(dir, *dl);
955                 *dl = NULL;
956                 iam_path_fini(path);
957         }
958         return result;
959 }
960 /*
961  * Performs tree top-to-bottom traversal starting from root, and loads leaf
962  * node.
963  */
964 static int iam_path_lookup(struct iam_path *path, int index)
965 {
966         struct iam_container *c;
967         struct iam_descr *descr;
968         struct iam_leaf  *leaf;
969         int result;
970
971         c = path->ip_container;
972         leaf = &path->ip_leaf;
973         descr = iam_path_descr(path);
974         result = iam_lookup_lock(path, &leaf->il_lock, DLT_WRITE);
975         assert_inv(iam_path_check(path));
976         do_corr(schedule());
977         if (result == 0) {
978                 result = iam_leaf_load(path);
979                 assert_inv(ergo(result == 0, iam_leaf_check(leaf)));
980                 if (result == 0) {
981                         do_corr(schedule());
982                         if (index)
983                                 result = iam_leaf_ops(leaf)->
984                                         ilookup(leaf, path->ip_ikey_target);
985                         else
986                                 result = iam_leaf_ops(leaf)->
987                                         lookup(leaf, path->ip_key_target);
988                         do_corr(schedule());
989                 }
990                 if (result < 0)
991                         iam_leaf_unlock(leaf);
992         }
993         return result;
994 }
995
996 /*
997  * Common part of iam_it_{i,}get().
998  */
999 static int __iam_it_get(struct iam_iterator *it, int index)
1000 {
1001         int result;
1002         assert_corr(it_state(it) == IAM_IT_DETACHED);
1003
1004         result = iam_path_lookup(&it->ii_path, index);
1005         if (result >= 0) {
1006                 int collision;
1007
1008                 collision = result & IAM_LOOKUP_LAST;
1009                 switch (result & ~IAM_LOOKUP_LAST) {
1010                 case IAM_LOOKUP_EXACT:
1011                         result = +1;
1012                         it->ii_state = IAM_IT_ATTACHED;
1013                         break;
1014                 case IAM_LOOKUP_OK:
1015                         result = 0;
1016                         it->ii_state = IAM_IT_ATTACHED;
1017                         break;
1018                 case IAM_LOOKUP_BEFORE:
1019                 case IAM_LOOKUP_EMPTY:
1020                         result = 0;
1021                         it->ii_state = IAM_IT_SKEWED;
1022                         break;
1023                 default:
1024                         assert(0);
1025                 }
1026                 result |= collision;
1027         }
1028         /*
1029          * See iam_it_get_exact() for explanation.
1030          */
1031         assert_corr(result != -ENOENT);
1032         return result;
1033 }
1034
1035 /*
1036  * The correct hash, but not the same key, was found; iterate through the
1037  * hash collision chain, looking for the correct record.
1038  */
1039 static int iam_it_collision(struct iam_iterator *it)
1040 {
1041         int result;
1042
1043         assert(ergo(it_at_rec(it), !it_keyeq(it, it->ii_path.ip_key_target)));
1044
1045         while ((result = iam_it_next(it)) == 0) {
1046                 do_corr(schedule());
1047                 if (it_ikeycmp(it, it->ii_path.ip_ikey_target) != 0)
1048                         return -ENOENT;
1049                 if (it_keyeq(it, it->ii_path.ip_key_target))
1050                         return 0;
1051         }
1052         return result;
1053 }
1054
1055 /*
1056  * Attach iterator. After successful completion, @it points to record with
1057  * least key not larger than @k.
1058  *
1059  * Return value: 0: positioned on existing record,
1060  *             +ve: exact position found,
1061  *             -ve: error.
1062  *
1063  * precondition:  it_state(it) == IAM_IT_DETACHED
1064  * postcondition: ergo(result == 0 && it_state(it) == IAM_IT_ATTACHED,
1065  *                     it_keycmp(it, k) <= 0)
1066  */
1067 int iam_it_get(struct iam_iterator *it, const struct iam_key *k)
1068 {
1069         int result;
1070         assert_corr(it_state(it) == IAM_IT_DETACHED);
1071
1072         it->ii_path.ip_ikey_target = NULL;
1073         it->ii_path.ip_key_target  = k;
1074
1075         result = __iam_it_get(it, 0);
1076
1077         if (result == IAM_LOOKUP_LAST) {
1078                 result = iam_it_collision(it);
1079                 if (result != 0) {
1080                         iam_it_put(it);
1081                         iam_it_fini(it);
1082                         result = __iam_it_get(it, 0);
1083                 } else
1084                         result = +1;
1085         }
1086         if (result > 0)
1087                 result &= ~IAM_LOOKUP_LAST;
1088
1089         assert_corr(ergo(result > 0, it_keycmp(it, k) == 0));
1090         assert_corr(ergo(result == 0 && it_state(it) == IAM_IT_ATTACHED,
1091                          it_keycmp(it, k) <= 0));
1092         return result;
1093 }
1094 EXPORT_SYMBOL(iam_it_get);
1095
1096 /*
1097  * Attach iterator by index key.
1098  */
1099 static int iam_it_iget(struct iam_iterator *it, const struct iam_ikey *k)
1100 {
1101         assert_corr(it_state(it) == IAM_IT_DETACHED);
1102
1103         it->ii_path.ip_ikey_target = k;
1104         return __iam_it_get(it, 1) & ~IAM_LOOKUP_LAST;
1105 }
1106
1107 /*
1108  * Attach iterator, and ensure it points to the record (not skewed).
1109  *
1110  * Return value: 0: positioned on existing record,
1111  *             +ve: exact position found,
1112  *             -ve: error.
1113  *
1114  * precondition:  it_state(it) == IAM_IT_DETACHED &&
1115  *                !(it->ii_flags&IAM_IT_WRITE)
1116  * postcondition: ergo(result == 0, it_state(it) == IAM_IT_ATTACHED)
1117  */
1118 int iam_it_get_at(struct iam_iterator *it, const struct iam_key *k)
1119 {
1120         int result;
1121         assert_corr(it_state(it) == IAM_IT_DETACHED &&
1122                     !(it->ii_flags&IAM_IT_WRITE));
1123         result = iam_it_get(it, k);
1124         if (result == 0) {
1125                 if (it_state(it) != IAM_IT_ATTACHED) {
1126                         assert_corr(it_state(it) == IAM_IT_SKEWED);
1127                         result = iam_it_next(it);
1128                 }
1129         }
1130         assert_corr(ergo(result >= 0, it_state(it) == IAM_IT_ATTACHED));
1131         return result;
1132 }
1133 EXPORT_SYMBOL(iam_it_get_at);
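
/*
 * A sketch of a read-only lookup through the iterator interface, as a
 * hypothetical caller might do it (the container is assumed to be protected
 * by iam_container_read_lock()/iam_container_read_unlock() around this):
 */
#if 0
static int example_lookup(struct iam_container *c, struct iam_path_descr *ipd,
                          const struct iam_key *key)
{
        struct iam_iterator it;
        int rc;

        iam_it_init(&it, c, 0, ipd);

        rc = iam_it_get_at(&it, key);
        if (rc > 0) {
                struct iam_rec *rec = iam_it_rec_get(&it);

                /* @rec is only valid while the iterator stays attached,
                 * so copy the record out here before detaching */
                rc = 0;
        } else if (rc == 0) {
                /* positioned on a record with a different key */
                rc = -ENOENT;
        }
        iam_it_put(&it);
        iam_it_fini(&it);
        return rc;
}
#endif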
1134
1135 /*
1136  * Duplicates iterator.
1137  *
1138  * postcondition: it_state(dst) == it_state(src) &&
1139  *                iam_it_container(dst) == iam_it_container(src) &&
1140  *                dst->ii_flags == src->ii_flags &&
1141  *                ergo(it_state(src) == IAM_IT_ATTACHED,
1142  *                     iam_it_rec_get(dst) == iam_it_rec_get(src) &&
1143  *                     iam_it_key_get(dst) == iam_it_key_get(src))
1144  */
1145 void iam_it_dup(struct iam_iterator *dst, const struct iam_iterator *src)
1146 {
1147         dst->ii_flags     = src->ii_flags;
1148         dst->ii_state     = src->ii_state;
1149         /* XXX not yet. iam_path_dup(&dst->ii_path, &src->ii_path); */
1150         /*
1151          * XXX: duplicate lock.
1152          */
1153         assert_corr(it_state(dst) == it_state(src));
1154         assert_corr(iam_it_container(dst) == iam_it_container(src));
1155         assert_corr(dst->ii_flags == src->ii_flags);
1156         assert_corr(ergo(it_state(src) == IAM_IT_ATTACHED,
1157                     iam_it_rec_get(dst) == iam_it_rec_get(src) &&
1158                     iam_it_key_get(dst) == iam_it_key_get(src)));
1159
1160 }
1161
1162 /*
1163  * Detach iterator. Does nothing in detached state.
1164  *
1165  * postcondition: it_state(it) == IAM_IT_DETACHED
1166  */
1167 void iam_it_put(struct iam_iterator *it)
1168 {
1169         if (it->ii_state != IAM_IT_DETACHED) {
1170                 it->ii_state = IAM_IT_DETACHED;
1171                 iam_leaf_fini(&it->ii_path.ip_leaf);
1172         }
1173 }
1174 EXPORT_SYMBOL(iam_it_put);
1175
1176 static struct iam_ikey *iam_it_ikey_get(const struct iam_iterator *it,
1177                                         struct iam_ikey *ikey);
1178
1179
1180 /*
1181  * This function increments the frame pointer to search the next leaf
1182  * block, and reads in the necessary intervening nodes if the search
1183  * should be necessary.  Whether or not the search is necessary is
1184  * controlled by the hash parameter.  If the hash value is even, then
1185  * the search is only continued if the next block starts with that
1186  * hash value.  This is used if we are searching for a specific file.
1187  *
1188  * If the hash value is HASH_NB_ALWAYS, then always go to the next block.
1189  *
1190  * This function returns 1 if the caller should continue to search,
1191  * or 0 if it should not.  If there is an error reading one of the
1192  * index blocks, it will return a negative error code.
1193  *
1194  * If start_hash is non-null, it will be filled in with the starting
1195  * hash of the next page.
1196  */
1197 static int iam_htree_advance(struct inode *dir, __u32 hash,
1198                               struct iam_path *path, __u32 *start_hash,
1199                               int compat)
1200 {
1201         struct iam_frame *p;
1202         struct buffer_head *bh;
1203         int err, num_frames = 0;
1204         __u32 bhash;
1205
1206         p = path->ip_frame;
1207         /*
1208          * Find the next leaf page by incrementing the frame pointer.
1209          * If we run out of entries in the interior node, loop around and
1210          * increment pointer in the parent node.  When we break out of
1211          * this loop, num_frames indicates the number of interior
1212          * nodes need to be read.
1213          */
1214         while (1) {
1215                 do_corr(schedule());
1216                 iam_lock_bh(p->bh);
1217                 p->at = iam_entry_shift(path, p->at, +1);
1218                 if (p->at < iam_entry_shift(path, p->entries,
1219                                             dx_get_count(p->entries))) {
1220                         p->leaf = dx_get_block(path, p->at);
1221                         iam_unlock_bh(p->bh);
1222                         break;
1223                 }
1224                 iam_unlock_bh(p->bh);
1225                 if (p == path->ip_frames)
1226                         return 0;
1227                 num_frames++;
1228                 --p;
1229         }
1230
1231         if (compat) {
1232                 /*
1233                  * Htree hash magic.
1234                  *
1235                  * If the hash is 1, then continue only if the next page has
1236                  * a continuation hash of any value.  This is used for
1237                  * readdir handling.  Otherwise, check to see if the hash
1238                  * matches the desired continuation hash.  If it doesn't,
1239                  * return, since there's no point in reading the successive
1240                  * index pages.
1241                  */
1242                 dx_get_ikey(path, p->at, (struct iam_ikey *)&bhash);
1243                 if (start_hash)
1244                         *start_hash = bhash;
1245                 if ((hash & 1) == 0) {
1246                         if ((bhash & ~1) != hash)
1247                                 return 0;
1248                 }
1249         }
1250         /*
1251          * If the hash is HASH_NB_ALWAYS, we always go to the next
1252          * block so no check is necessary
1253          */
1254         while (num_frames--) {
1255                 iam_ptr_t idx;
1256
1257                 do_corr(schedule());
1258                 iam_lock_bh(p->bh);
1259                 idx = p->leaf = dx_get_block(path, p->at);
1260                 iam_unlock_bh(p->bh);
1261                 err = iam_path_descr(path)->id_ops->
1262                         id_node_read(path->ip_container, idx, NULL, &bh);
1263                 if (err != 0)
1264                         return err; /* Failure */
1265                 ++p;
1266                 brelse(p->bh);
1267                 assert_corr(p->bh != bh);
1268                 p->bh = bh;
1269                 p->entries = dx_node_get_entries(path, p);
1270                 p->at = iam_entry_shift(path, p->entries, !compat);
1271                 assert_corr(p->curidx != idx);
1272                 p->curidx = idx;
1273                 iam_lock_bh(p->bh);
1274                 assert_corr(p->leaf != dx_get_block(path, p->at));
1275                 p->leaf = dx_get_block(path, p->at);
1276                 iam_unlock_bh(p->bh);
1277                 assert_inv(dx_node_check(path, p));
1278         }
1279         return 1;
1280 }
1281
1282
1283 static inline int iam_index_advance(struct iam_path *path)
1284 {
1285         return iam_htree_advance(iam_path_obj(path), 0, path, NULL, 0);
1286 }
1287
1288 static void iam_unlock_array(struct inode *dir, struct dynlock_handle **lh)
1289 {
1290         int i;
1291
1292         for (i = 0; i < DX_MAX_TREE_HEIGHT; ++i, ++lh) {
1293                 if (*lh != NULL) {
1294                         iam_unlock_htree(dir, *lh);
1295                         *lh = NULL;
1296                 }
1297         }
1298 }
1299 /*
1300  * Advance index part of @path to point to the next leaf. Returns 1 on
1301  * success, 0, when end of container was reached. Leaf node is locked.
1302  */
1303 int iam_index_next(struct iam_container *c, struct iam_path *path)
1304 {
1305         iam_ptr_t cursor;
1306         struct dynlock_handle *lh[DX_MAX_TREE_HEIGHT] = { 0, };
1307         int result;
1308         struct inode *object;
1309
1310         /*
1311          * Locking for iam_index_next()... is to be described.
1312          */
1313
1314         object = c->ic_object;
1315         cursor = path->ip_frame->leaf;
1316
1317         while (1) {
1318                 result = iam_index_lock(path, lh);
1319                 do_corr(schedule());
1320                 if (result < 0)
1321                         break;
1322
1323                 result = iam_check_full_path(path, 0);
1324                 if (result == 0 && cursor == path->ip_frame->leaf) {
1325                         result = iam_index_advance(path);
1326
1327                         assert_corr(result == 0 ||
1328                                     cursor != path->ip_frame->leaf);
1329                         break;
1330                 }
1331                 do {
1332                         iam_unlock_array(object, lh);
1333
1334                         iam_path_release(path);
1335                         do_corr(schedule());
1336
1337                         result = __iam_path_lookup(path);
1338                         if (result < 0)
1339                                 break;
1340
1341                         while (path->ip_frame->leaf != cursor) {
1342                                 do_corr(schedule());
1343
1344                                 result = iam_index_lock(path, lh);
1345                                 do_corr(schedule());
1346                                 if (result < 0)
1347                                         break;
1348
1349                                 result = iam_check_full_path(path, 0);
1350                                 if (result != 0)
1351                                         break;
1352
1353                                 result = iam_index_advance(path);
1354                                 if (result == 0) {
1355                                         CERROR("cannot find cursor : %u\n",
1356                                                 cursor);
1357                                         result = -EIO;
1358                                 }
1359                                 if (result < 0)
1360                                         break;
1361                                 result = iam_check_full_path(path, 0);
1362                                 if (result != 0)
1363                                         break;
1364                                 iam_unlock_array(object, lh);
1365                         }
1366                 } while (result == -EAGAIN);
1367                 if (result < 0)
1368                         break;
1369         }
1370         iam_unlock_array(object, lh);
1371         return result;
1372 }
1373
1374 /*
1375  * Move iterator one record right.
1376  *
1377  * Return value: 0: success,
1378  *              +1: end of container reached
1379  *             -ve: error
1380  *
1381  * precondition:  (it_state(it) == IAM_IT_ATTACHED ||
1382  *                 it_state(it) == IAM_IT_SKEWED) && it->ii_flags&IAM_IT_MOVE
1383  * postcondition: ergo(result == 0, it_state(it) == IAM_IT_ATTACHED) &&
1384  *                ergo(result >  0, it_state(it) == IAM_IT_DETACHED)
1385  */
1386 int iam_it_next(struct iam_iterator *it)
1387 {
1388         int result;
1389         struct iam_path      *path;
1390         struct iam_leaf      *leaf;
1391         struct inode         *obj;
1392         do_corr(struct iam_ikey *ik_orig);
1393
1394         /* assert_corr(it->ii_flags&IAM_IT_MOVE); */
1395         assert_corr(it_state(it) == IAM_IT_ATTACHED ||
1396                     it_state(it) == IAM_IT_SKEWED);
1397
1398         path = &it->ii_path;
1399         leaf = &path->ip_leaf;
1400         obj  = iam_path_obj(path);
1401
1402         assert_corr(iam_leaf_is_locked(leaf));
1403
1404         result = 0;
1405         do_corr(ik_orig = it_at_rec(it) ?
1406                 iam_it_ikey_get(it, iam_path_ikey(path, 2)) : NULL);
1407         if (it_before(it)) {
1408                 assert_corr(!iam_leaf_at_end(leaf));
1409                 it->ii_state = IAM_IT_ATTACHED;
1410         } else {
1411                 if (!iam_leaf_at_end(leaf))
1412                         /* advance within leaf node */
1413                         iam_leaf_next(leaf);
1414                 /*
1415                  * multiple iterations may be necessary due to empty leaves.
1416                  */
1417                 while (result == 0 && iam_leaf_at_end(leaf)) {
1418                         do_corr(schedule());
1419                         /* advance index portion of the path */
1420                         result = iam_index_next(iam_it_container(it), path);
1421                         assert_corr(iam_leaf_is_locked(leaf));
1422                         if (result == 1) {
1423                                 struct dynlock_handle *lh;
1424                                 lh = iam_lock_htree(obj, path->ip_frame->leaf,
1425                                                    DLT_WRITE);
1426                                 if (lh != NULL) {
1427                                         iam_leaf_fini(leaf);
1428                                         leaf->il_lock = lh;
1429                                         result = iam_leaf_load(path);
1430                                         if (result == 0)
1431                                                 iam_leaf_start(leaf);
1432                                 } else
1433                                         result = -ENOMEM;
1434                         } else if (result == 0)
1435                                 /* end of container reached */
1436                                 result = +1;
1437                         if (result != 0)
1438                                 iam_it_put(it);
1439                 }
1440                 if (result == 0)
1441                         it->ii_state = IAM_IT_ATTACHED;
1442         }
1443         assert_corr(ergo(result == 0, it_state(it) == IAM_IT_ATTACHED));
1444         assert_corr(ergo(result >  0, it_state(it) == IAM_IT_DETACHED));
1445         assert_corr(ergo(result == 0 && ik_orig != NULL,
1446                          it_ikeycmp(it, ik_orig) >= 0));
1447         return result;
1448 }
1449 EXPORT_SYMBOL(iam_it_next);
1450
1451 /*
1452  * Return pointer to the record under iterator.
1453  *
1454  * precondition:  it_state(it) == IAM_IT_ATTACHED && it_at_rec(it)
1455  * postcondition: it_state(it) == IAM_IT_ATTACHED
1456  */
1457 struct iam_rec *iam_it_rec_get(const struct iam_iterator *it)
1458 {
1459         assert_corr(it_state(it) == IAM_IT_ATTACHED);
1460         assert_corr(it_at_rec(it));
1461         return iam_leaf_rec(&it->ii_path.ip_leaf);
1462 }
1463 EXPORT_SYMBOL(iam_it_rec_get);
1464
1465 static void iam_it_reccpy(struct iam_iterator *it, const struct iam_rec *r)
1466 {
1467         struct iam_leaf *folio;
1468
1469         folio = &it->ii_path.ip_leaf;
1470         iam_leaf_ops(folio)->rec_set(folio, r);
1471 }
1472
1473 /*
1474  * Replace contents of record under iterator.
1475  *
1476  * precondition:  it_state(it) == IAM_IT_ATTACHED &&
1477  *                it->ii_flags&IAM_IT_WRITE
1478  * postcondition: it_state(it) == IAM_IT_ATTACHED &&
1479  *                ergo(result == 0, !memcmp(iam_it_rec_get(it), r, ...))
1480  */
1481 int iam_it_rec_set(handle_t *h,
1482                    struct iam_iterator *it, const struct iam_rec *r)
1483 {
1484         int result;
1485         struct iam_path *path;
1486         struct buffer_head *bh;
1487
1488         assert_corr(it_state(it) == IAM_IT_ATTACHED &&
1489                     it->ii_flags&IAM_IT_WRITE);
1490         assert_corr(it_at_rec(it));
1491
1492         path = &it->ii_path;
1493         bh   = path->ip_leaf.il_bh;
1494         result = iam_txn_add(h, path, bh);
1495         if (result == 0) {
1496                 iam_it_reccpy(it, r);
1497                 result = iam_txn_dirty(h, path, bh);
1498         }
1499         return result;
1500 }
1501 EXPORT_SYMBOL(iam_it_rec_set);
1502
1503 /*
1504  * Return pointer to the index key under iterator.
1505  *
1506  * precondition:  it_state(it) == IAM_IT_ATTACHED ||
1507  *                it_state(it) == IAM_IT_SKEWED
1508  */
1509 static struct iam_ikey *iam_it_ikey_get(const struct iam_iterator *it,
1510                                         struct iam_ikey *ikey)
1511 {
1512         assert_corr(it_state(it) == IAM_IT_ATTACHED ||
1513                     it_state(it) == IAM_IT_SKEWED);
1514         assert_corr(it_at_rec(it));
1515         return iam_leaf_ikey(&it->ii_path.ip_leaf, ikey);
1516 }
1517
1518 /*
1519  * Return pointer to the key under iterator.
1520  *
1521  * precondition:  it_state(it) == IAM_IT_ATTACHED ||
1522  *                it_state(it) == IAM_IT_SKEWED
1523  */
1524 struct iam_key *iam_it_key_get(const struct iam_iterator *it)
1525 {
1526         assert_corr(it_state(it) == IAM_IT_ATTACHED ||
1527                     it_state(it) == IAM_IT_SKEWED);
1528         assert_corr(it_at_rec(it));
1529         return iam_leaf_key(&it->ii_path.ip_leaf);
1530 }
1531 EXPORT_SYMBOL(iam_it_key_get);
1532
1533 /*
1534  * Return size of key under iterator (in bytes)
1535  *
1536  * precondition:  it_state(it) == IAM_IT_ATTACHED ||
1537  *                it_state(it) == IAM_IT_SKEWED
1538  */
1539 int iam_it_key_size(const struct iam_iterator *it)
1540 {
1541         assert_corr(it_state(it) == IAM_IT_ATTACHED ||
1542                     it_state(it) == IAM_IT_SKEWED);
1543         assert_corr(it_at_rec(it));
1544         return iam_leaf_key_size(&it->ii_path.ip_leaf);
1545 }
1546 EXPORT_SYMBOL(iam_it_key_size);
1547
1548 /*
1549  * Insertion of a new record. Interaction with jbd in the non-trivial case
1550  * (when a split happens) is as follows:
1551  *
1552  *  - new leaf node is involved into transaction by ldiskfs_append();
1553  *
1554  *  - old leaf node is involved into transaction by iam_add_rec();
1555  *
1556  *  - the leaf where the insertion point ends up is marked dirty by iam_add_rec();
1557  *
1558  *  - leaf without insertion point is marked dirty (as @new_leaf) by
1559  *  iam_new_leaf();
1560  *
1561  *  - split index nodes are involved into transaction and marked dirty by
1562  *  split_index_node().
1563  *
1564  *  - "safe" index node, which is no split, but where new pointer is inserted
1565  *  is involved into transaction and marked dirty by split_index_node().
1566  *
1567  *  - index node where pointer to new leaf is inserted is involved into
1568  *  transaction by split_index_node() and marked dirty by iam_add_rec().
1569  *
1570  *  - inode is marked dirty by iam_add_rec().
1571  *
1572  */
1573
1574 static int iam_new_leaf(handle_t *handle, struct iam_leaf *leaf)
1575 {
1576         int err;
1577         iam_ptr_t blknr;
1578         struct buffer_head   *new_leaf;
1579         struct buffer_head   *old_leaf;
1580         struct iam_container *c;
1581         struct inode         *obj;
1582         struct iam_path      *path;
1583
1584         assert_inv(iam_leaf_check(leaf));
1585
1586         c = iam_leaf_container(leaf);
1587         path = leaf->il_path;
1588
1589         obj = c->ic_object;
1590         new_leaf = ldiskfs_append(handle, obj, (__u32 *)&blknr, &err);
1591         do_corr(schedule());
1592         if (new_leaf != NULL) {
1593                 struct dynlock_handle *lh;
1594
1595                 lh = iam_lock_htree(obj, blknr, DLT_WRITE);
1596                 do_corr(schedule());
1597                 if (lh != NULL) {
1598                         iam_leaf_ops(leaf)->init_new(c, new_leaf);
1599                         do_corr(schedule());
1600                         old_leaf = leaf->il_bh;
1601                         iam_leaf_split(leaf, &new_leaf, blknr);
1602                         if (old_leaf != leaf->il_bh) {
1603                                 /*
1604                                  * Switched to the new leaf.
1605                                  */
1606                                 iam_leaf_unlock(leaf);
1607                                 leaf->il_lock = lh;
1608                                 path->ip_frame->leaf = blknr;
1609                         } else
1610                                 iam_unlock_htree(obj, lh);
1611                         do_corr(schedule());
1612                         err = iam_txn_dirty(handle, path, new_leaf);
1613                         brelse(new_leaf);
1614                         if (err == 0)
1615                                 err = ldiskfs_mark_inode_dirty(handle, obj);
1616                         do_corr(schedule());
1617                 } else
1618                         err = -ENOMEM;
1619         }
1620         assert_inv(iam_leaf_check(leaf));
1621         assert_inv(iam_leaf_check(&iam_leaf_path(leaf)->ip_leaf));
1622         assert_inv(iam_path_check(iam_leaf_path(leaf)));
1623         return err;
1624 }
1625
1626 static inline void dx_set_limit(struct iam_entry *entries, unsigned value)
1627 {
1628         ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
1629 }
1630
1631 static int iam_shift_entries(struct iam_path *path,
1632                          struct iam_frame *frame, unsigned count,
1633                          struct iam_entry *entries, struct iam_entry *entries2,
1634                          u32 newblock)
1635 {
1636         unsigned count1;
1637         unsigned count2;
1638         int delta;
1639
1640         struct iam_frame *parent = frame - 1;
1641         struct iam_ikey *pivot = iam_path_ikey(path, 3);
1642
1643         delta = dx_index_is_compat(path) ? 0 : +1;
1644
1645         count1 = count/2 + delta;
1646         count2 = count - count1;
1647         dx_get_ikey(path, iam_entry_shift(path, entries, count1), pivot);
1648
1649         dxtrace(printk("Split index %i/%i\n", count1, count2));
1650
1651         memcpy((char *) iam_entry_shift(path, entries2, delta),
1652                (char *) iam_entry_shift(path, entries, count1),
1653                count2 * iam_entry_size(path));
1654
1655         dx_set_count(entries2, count2 + delta);
1656         dx_set_limit(entries2, dx_node_limit(path));
1657
1658         /*
1659          * NOTE: very subtle piece of code. A competing dx_probe() may find
1660          * the 2nd-level index in the root index; we then insert a new index
1661          * here and set a new count in that 2nd-level index, so dx_probe() may
1662          * see the 2nd-level index without the hash it looks for. The fix is to
1663          * re-check the root index after locking the just-found 2nd-level index -bzzz
1664          */
1665         iam_insert_key_lock(path, parent, pivot, newblock);
1666
1667         /*
1668          * now the old and new 2nd-level index blocks contain all pointers, so
1669          * dx_probe() may find what it looks for in either of them.  That is OK -bzzz
1670          */
1671         iam_lock_bh(frame->bh);
1672         dx_set_count(entries, count1);
1673         iam_unlock_bh(frame->bh);
1674
1675         /*
1676          * now the old 2nd-level index block points to the first half of the
1677          * leaves. It is important that dx_probe() checks the root index block
1678          * for changes under dx_lock_bh(frame->bh) -bzzz
1679          */
1680
1681         return count1;
1682 }
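/*
 * Worked example (numbers chosen for illustration only): with count == 511
 * entries and delta == +1 (non-compat index), count1 == 511/2 + 1 == 256
 * entries stay in the old block and count2 == 255 entries are copied into the
 * new block; the ikey of entry number count1 becomes the pivot that
 * iam_insert_key_lock() inserts into the parent frame.
 */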
1683
1684
1685 int split_index_node(handle_t *handle, struct iam_path *path,
1686                      struct dynlock_handle **lh)
1687 {
1688
1689         struct iam_entry *entries;   /* old block contents */
1690         struct iam_entry *entries2;  /* new block contents */
1691         struct iam_frame *frame, *safe;
1692         struct buffer_head *bh_new[DX_MAX_TREE_HEIGHT] = {0};
1693         u32 newblock[DX_MAX_TREE_HEIGHT] = {0};
1694         struct dynlock_handle *lock[DX_MAX_TREE_HEIGHT] = {NULL,};
1695         struct dynlock_handle *new_lock[DX_MAX_TREE_HEIGHT] = {NULL,};
1696         struct inode *dir = iam_path_obj(path);
1697         struct iam_descr *descr;
1698         int nr_splet;
1699         int i, err;
1700
1701         descr = iam_path_descr(path);
1702         /*
1703          * Algorithm below depends on this.
1704          */
1705         assert_corr(dx_root_limit(path) < dx_node_limit(path));
1706
1707         frame = path->ip_frame;
1708         entries = frame->entries;
1709
1710         /*
1711          * Tall-tree handling: we might have to split multiple index blocks
1712          * all the way up to the tree root. The tricky point here is error
1713          * handling: to avoid a complicated undo/rollback we
1714          *
1715          *   - first allocate all necessary blocks,
1716          *
1717          *   - then insert pointers into them atomically.
1718          */
1719
1720         /*
1721          * Locking: the leaf is already locked. htree locks are acquired,
1722          * bottom-to-top, on all index nodes that require a split, on the
1723          * "safe" node, and on all new nodes.
1724          */
1725
1726         dxtrace(printk("using %u of %u node entries\n",
1727                        dx_get_count(entries), dx_get_limit(entries)));
1728
1729         /* Which levels need to be split? */
1730         for (nr_splet = 0; frame >= path->ip_frames &&
1731              dx_get_count(frame->entries) == dx_get_limit(frame->entries);
1732              --frame, ++nr_splet) {
1733                 do_corr(schedule());
1734                 if (nr_splet == DX_MAX_TREE_HEIGHT) {
1735                         /*
1736                         CWARN(dir->i_sb, __FUNCTION__,
1737                                      "Directory index full!\n");
1738                                      */
1739                         err = -ENOSPC;
1740                         goto cleanup;
1741                 }
1742         }
1743
1744         safe = frame;
1745
1746         /*
1747          * Lock all nodes, bottom to top.
1748          */
1749         for (frame = path->ip_frame, i = nr_splet; i >= 0; --i, --frame) {
1750                 do_corr(schedule());
1751                 lock[i] = iam_lock_htree(dir, frame->curidx, DLT_WRITE);
1752                 if (lock[i] == NULL) {
1753                         err = -ENOMEM;
1754                         goto cleanup;
1755                 }
1756         }
1757
1758         /*
1759          * Check for concurrent index modification.
1760          */
1761         err = iam_check_full_path(path, 1);
1762         if (err)
1763                 goto cleanup;
1764         /*
1765          * And check that the same number of nodes is to be split.
1766          */
1767         for (i = 0, frame = path->ip_frame; frame >= path->ip_frames &&
1768              dx_get_count(frame->entries) == dx_get_limit(frame->entries);
1769              --frame, ++i) {
1770                 ;
1771         }
1772         if (i != nr_splet) {
1773                 err = -EAGAIN;
1774                 goto cleanup;
1775         }
1776
1777         /* Go back down, allocating blocks, locking them, and adding them
1778          * to the transaction... */
1779         for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) {
1780                 bh_new[i] = ldiskfs_append(handle, dir, &newblock[i], &err);
1781                 do_corr(schedule());
1782                 if (!bh_new[i] ||
1783                     descr->id_ops->id_node_init(path->ip_container,
1784                                                 bh_new[i], 0) != 0)
1785                         goto cleanup;
1786                 new_lock[i] = iam_lock_htree(dir, newblock[i], DLT_WRITE);
1787                 if (new_lock[i] == NULL) {
1788                         err = -ENOMEM;
1789                         goto cleanup;
1790                 }
1791                 do_corr(schedule());
1792                 BUFFER_TRACE(frame->bh, "get_write_access");
1793                 err = ldiskfs_journal_get_write_access(handle, frame->bh);
1794                 if (err)
1795                         goto journal_error;
1796         }
1797         /* Add "safe" node to transaction too */
1798         if (safe + 1 != path->ip_frames) {
1799                 do_corr(schedule());
1800                 err = ldiskfs_journal_get_write_access(handle, safe->bh);
1801                 if (err)
1802                         goto journal_error;
1803         }
1804
1805         /* Go through nodes once more, inserting pointers */
1806         for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) {
1807                 unsigned count;
1808                 int idx;
1809                 struct buffer_head *bh2;
1810                 struct buffer_head *bh;
1811
1812                 entries = frame->entries;
1813                 count = dx_get_count(entries);
1814                 idx = iam_entry_diff(path, frame->at, entries);
1815
1816                 bh2 = bh_new[i];
1817                 entries2 = dx_get_entries(path, bh2->b_data, 0);
1818
1819                 bh = frame->bh;
1820                 if (frame == path->ip_frames) {
1821                         /* Splitting the root node. Tricky point:
1822                          *
1823                          * In a "normal" B-tree we'd split the root *and* add
1824                          * a new root to the tree with pointers to the old
1825                          * root and its sibling (thus introducing two new nodes).
1826                          *
1827                          * In htree it is enough to add one node, because the
1828                          * capacity of the root node is smaller than that of
1829                          * a non-root one.
1830                          */
1831                         struct iam_frame *frames;
1832                         struct iam_entry *next;
1833
1834                         assert_corr(i == 0);
1835
1836                         do_corr(schedule());
1837
1838                         frames = path->ip_frames;
1839                         memcpy((char *) entries2, (char *) entries,
1840                                count * iam_entry_size(path));
1841                         dx_set_limit(entries2, dx_node_limit(path));
1842
1843                         /* Set up root */
1844                         iam_lock_bh(frame->bh);
1845                         next = descr->id_ops->id_root_inc(path->ip_container,
1846                                                           path, frame);
1847                         dx_set_block(path, next, newblock[0]);
1848                         iam_unlock_bh(frame->bh);
1849
1850                         do_corr(schedule());
1851                         /* Shift frames in the path */
1852                         memmove(frames + 2, frames + 1,
1853                                 (sizeof path->ip_frames) - 2 * sizeof frames[0]);
1854                         /* Add new access path frame */
1855                         frames[1].at = iam_entry_shift(path, entries2, idx);
1856                         frames[1].entries = entries = entries2;
1857                         frames[1].bh = bh2;
1858                         assert_inv(dx_node_check(path, frame));
1859                         ++ path->ip_frame;
1860                         ++ frame;
1861                         assert_inv(dx_node_check(path, frame));
1862                         bh_new[0] = NULL; /* buffer head is "consumed" */
1863                         err = ldiskfs_journal_get_write_access(handle, bh2);
1864                         if (err)
1865                                 goto journal_error;
1866                         do_corr(schedule());
1867                 } else {
1868                         /* splitting non-root index node. */
1869                         struct iam_frame *parent = frame - 1;
1870
1871                         do_corr(schedule());
1872                         count = iam_shift_entries(path, frame, count,
1873                                               entries, entries2, newblock[i]);
1874                         /* Which index block gets the new entry? */
1875                         if (idx >= count) {
1876                                 int d = dx_index_is_compat(path) ? 0 : +1;
1877
1878                                 frame->at = iam_entry_shift(path, entries2,
1879                                                             idx - count + d);
1880                                 frame->entries = entries = entries2;
1881                                 frame->curidx = newblock[i];
1882                                 swap(frame->bh, bh2);
1883                                 assert_corr(lock[i + 1] != NULL);
1884                                 assert_corr(new_lock[i] != NULL);
1885                                 swap(lock[i + 1], new_lock[i]);
1886                                 bh_new[i] = bh2;
1887                                 parent->at = iam_entry_shift(path,
1888                                                              parent->at, +1);
1889                         }
1890                         assert_inv(dx_node_check(path, frame));
1891                         assert_inv(dx_node_check(path, parent));
1892                         dxtrace(dx_show_index ("node", frame->entries));
1893                         dxtrace(dx_show_index ("node",
1894                                ((struct dx_node *) bh2->b_data)->entries));
1895                         err = ldiskfs_journal_dirty_metadata(handle, bh2);
1896                         if (err)
1897                                 goto journal_error;
1898                         do_corr(schedule());
1899                         err = ldiskfs_journal_dirty_metadata(handle, parent->bh);
1900                         if (err)
1901                                 goto journal_error;
1902                 }
1903                 do_corr(schedule());
1904                 err = ldiskfs_journal_dirty_metadata(handle, bh);
1905                 if (err)
1906                         goto journal_error;
1907         }
1908         /*
1909          * This function was called to make insertion of new leaf
1910          * possible. Check that it fulfilled its obligations.
1911          */
1912         assert_corr(dx_get_count(path->ip_frame->entries) <
1913                     dx_get_limit(path->ip_frame->entries));
1914         assert_corr(lock[nr_splet] != NULL);
1915         *lh = lock[nr_splet];
1916         lock[nr_splet] = NULL;
1917         if (nr_splet > 0) {
1918                 /*
1919                  * Log ->i_size modification.
1920                  */
1921                 err = ldiskfs_mark_inode_dirty(handle, dir);
1922                 if (err)
1923                         goto journal_error;
1924         }
1925         goto cleanup;
1926 journal_error:
1927         ldiskfs_std_error(dir->i_sb, err);
1928
1929 cleanup:
1930         iam_unlock_array(dir, lock);
1931         iam_unlock_array(dir, new_lock);
1932
1933         assert_corr(err || iam_frame_is_locked(path, path->ip_frame));
1934
1935         do_corr(schedule());
1936         for (i = 0; i < ARRAY_SIZE(bh_new); ++i) {
1937                 if (bh_new[i] != NULL)
1938                         brelse(bh_new[i]);
1939         }
1940         return err;
1941 }
1942
1943 static int iam_add_rec(handle_t *handle, struct iam_iterator *it,
1944                        struct iam_path *path,
1945                        const struct iam_key *k, const struct iam_rec *r)
1946 {
1947         int err;
1948         struct iam_leaf *leaf;
1949
1950         leaf = &path->ip_leaf;
1951         assert_inv(iam_leaf_check(leaf));
1952         assert_inv(iam_path_check(path));
1953         err = iam_txn_add(handle, path, leaf->il_bh);
1954         if (err == 0) {
1955                 do_corr(schedule());
1956                 if (!iam_leaf_can_add(leaf, k, r)) {
1957                         struct dynlock_handle *lh = NULL;
1958
1959                         do {
1960                                 assert_corr(lh == NULL);
1961                                 do_corr(schedule());
1962                                 err = split_index_node(handle, path, &lh);
1963                                 if (err == -EAGAIN) {
1964                                         assert_corr(lh == NULL);
1965
1966                                         iam_path_fini(path);
1967                                         it->ii_state = IAM_IT_DETACHED;
1968
1969                                         do_corr(schedule());
1970                                         err = iam_it_get_exact(it, k);
1971                                         if (err == -ENOENT)
1972                                                 err = +1; /* repeat split */
1973                                         else if (err == 0)
1974                                                 err = -EEXIST;
1975                                 }
1976                         } while (err > 0);
1977                         assert_inv(iam_path_check(path));
1978                         if (err == 0) {
1979                                 assert_corr(lh != NULL);
1980                                 do_corr(schedule());
1981                                 err = iam_new_leaf(handle, leaf);
1982                                 if (err == 0)
1983                                         err = iam_txn_dirty(handle, path,
1984                                                             path->ip_frame->bh);
1985                         }
1986                         iam_unlock_htree(iam_path_obj(path), lh);
1987                         do_corr(schedule());
1988                 }
1989                 if (err == 0) {
1990                         iam_leaf_rec_add(leaf, k, r);
1991                         err = iam_txn_dirty(handle, path, leaf->il_bh);
1992                 }
1993         }
1994         assert_inv(iam_leaf_check(leaf));
1995         assert_inv(iam_leaf_check(&path->ip_leaf));
1996         assert_inv(iam_path_check(path));
1997         return err;
1998 }
1999
2000 /*
2001  * Insert new record with key @k and contents from @r, shifting records to the
2002  * right. On success, iterator is positioned on the newly inserted record.
2003  *
2004  * precondition: it->ii_flags&IAM_IT_WRITE &&
2005  *               (it_state(it) == IAM_IT_ATTACHED ||
2006  *                it_state(it) == IAM_IT_SKEWED) &&
2007  *               ergo(it_state(it) == IAM_IT_ATTACHED,
2008  *                    it_keycmp(it, k) <= 0) &&
2009  *               ergo(it_before(it), it_keycmp(it, k) > 0)
2010  * postcondition: ergo(result == 0,
2011  *                     it_state(it) == IAM_IT_ATTACHED &&
2012  *                     it_keycmp(it, k) == 0 &&
2013  *                     !memcmp(iam_it_rec_get(it), r, ...))
2014  */
2015 int iam_it_rec_insert(handle_t *h, struct iam_iterator *it,
2016                       const struct iam_key *k, const struct iam_rec *r)
2017 {
2018         int result;
2019         struct iam_path *path;
2020
2021         path = &it->ii_path;
2022
2023         assert_corr(it->ii_flags&IAM_IT_WRITE);
2024         assert_corr(it_state(it) == IAM_IT_ATTACHED ||
2025                     it_state(it) == IAM_IT_SKEWED);
2026         assert_corr(ergo(it_state(it) == IAM_IT_ATTACHED,
2027                          it_keycmp(it, k) <= 0));
2028         assert_corr(ergo(it_before(it), it_keycmp(it, k) > 0));
2029         result = iam_add_rec(h, it, path, k, r);
2030         if (result == 0)
2031                 it->ii_state = IAM_IT_ATTACHED;
2032         assert_corr(ergo(result == 0,
2033                          it_state(it) == IAM_IT_ATTACHED &&
2034                          it_keycmp(it, k) == 0));
2035         return result;
2036 }
2037 EXPORT_SYMBOL(iam_it_rec_insert);
2038
2039 /*
2040  * Delete record under iterator.
2041  *
2042  * precondition:  it_state(it) == IAM_IT_ATTACHED &&
2043  *                it->ii_flags&IAM_IT_WRITE &&
2044  *                it_at_rec(it)
2045  * postcondition: it_state(it) == IAM_IT_ATTACHED ||
2046  *                it_state(it) == IAM_IT_DETACHED
2047  */
2048 int iam_it_rec_delete(handle_t *h, struct iam_iterator *it)
2049 {
2050         int result;
2051         struct iam_leaf *leaf;
2052         struct iam_path *path;
2053
2054         assert_corr(it_state(it) == IAM_IT_ATTACHED &&
2055                     it->ii_flags&IAM_IT_WRITE);
2056         assert_corr(it_at_rec(it));
2057
2058         path = &it->ii_path;
2059         leaf = &path->ip_leaf;
2060
2061         assert_inv(iam_leaf_check(leaf));
2062         assert_inv(iam_path_check(path));
2063
2064         result = iam_txn_add(h, path, leaf->il_bh);
2065         /*
2066          * no compaction for now.
2067          */
2068         if (result == 0) {
2069                 iam_rec_del(leaf, it->ii_flags&IAM_IT_MOVE);
2070                 result = iam_txn_dirty(h, path, leaf->il_bh);
2071                 if (result == 0 && iam_leaf_at_end(leaf) &&
2072                     it->ii_flags&IAM_IT_MOVE) {
2073                         result = iam_it_next(it);
2074                         if (result > 0)
2075                                 result = 0;
2076                 }
2077         }
2078         assert_inv(iam_leaf_check(leaf));
2079         assert_inv(iam_path_check(path));
2080         assert_corr(it_state(it) == IAM_IT_ATTACHED ||
2081                     it_state(it) == IAM_IT_DETACHED);
2082         return result;
2083 }
2084 EXPORT_SYMBOL(iam_it_rec_delete);
2085
2086 /*
2087  * Convert iterator to cookie.
2088  *
2089  * precondition:  it_state(it) == IAM_IT_ATTACHED &&
2090  *                iam_path_descr(it->ii_path)->id_ikey_size <= sizeof(iam_pos_t)
2091  * postcondition: it_state(it) == IAM_IT_ATTACHED
2092  */
2093 iam_pos_t iam_it_store(const struct iam_iterator *it)
2094 {
2095         iam_pos_t result;
2096
2097         assert_corr(it_state(it) == IAM_IT_ATTACHED);
2098         assert_corr(it_at_rec(it));
2099         assert_corr(iam_it_container(it)->ic_descr->id_ikey_size <=
2100                     sizeof result);
2101
2102         result = 0;
2103         return *(iam_pos_t *)iam_it_ikey_get(it, (void *)&result);
2104 }
2105 EXPORT_SYMBOL(iam_it_store);
2106
2107 /*
2108  * Restore iterator from cookie.
2109  *
2110  * precondition:  it_state(it) == IAM_IT_DETACHED && it->ii_flags&IAM_IT_MOVE &&
2111  *                iam_path_descr(it->ii_path)->id_ikey_size <= sizeof(iam_pos_t)
2112  * postcondition: ergo(result == 0, it_state(it) == IAM_IT_ATTACHED &&
2113  *                                  iam_it_store(it) == pos)
2114  */
2115 int iam_it_load(struct iam_iterator *it, iam_pos_t pos)
2116 {
2117         assert_corr(it_state(it) == IAM_IT_DETACHED &&
2118                     it->ii_flags&IAM_IT_MOVE);
2119         assert_corr(iam_it_container(it)->ic_descr->id_ikey_size <= sizeof pos);
2120         return iam_it_iget(it, (struct iam_ikey *)&pos);
2121 }
2122 EXPORT_SYMBOL(iam_it_load);
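/*
 * A minimal round-trip sketch (added for illustration; the container @c and
 * path descriptor @pd are assumed to have been set up elsewhere, and the
 * container's ikey is assumed to fit into iam_pos_t): the position of an
 * attached iterator is stored as a cookie and later used to re-attach an
 * iterator at the same record.  IAM_IT_MOVE is required by the precondition
 * of iam_it_load().
 */
#if 0 /* illustrative example only, not compiled */
static void example_store_and_load(struct iam_container *c,
                                   struct iam_path_descr *pd,
                                   const struct iam_key *k)
{
        struct iam_iterator it;
        iam_pos_t cookie = 0;
        int rc;

        iam_it_init(&it, c, IAM_IT_MOVE, pd);
        rc = iam_it_get_exact(&it, k);
        if (rc == 0)
                cookie = iam_it_store(&it); /* remember current position */
        iam_it_put(&it);
        iam_it_fini(&it);
        if (rc != 0)
                return;

        /* ... later: re-attach an iterator at the remembered position ... */
        iam_it_init(&it, c, IAM_IT_MOVE, pd);
        if (iam_it_load(&it, cookie) >= 0)
                iam_it_put(&it);
        iam_it_fini(&it);
}
#endif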
2123
2124 /***********************************************************************/
2125 /* invariants                                                          */
2126 /***********************************************************************/
2127
2128 static inline int ptr_inside(void *base, size_t size, void *ptr)
2129 {
2130         return (base <= ptr) && (ptr < base + size);
2131 }
2132
2133 int iam_frame_invariant(struct iam_frame *f)
2134 {
2135         return
2136                 (f->bh != NULL &&
2137                 f->bh->b_data != NULL &&
2138                 ptr_inside(f->bh->b_data, f->bh->b_size, f->entries) &&
2139                 ptr_inside(f->bh->b_data, f->bh->b_size, f->at) &&
2140                 f->entries <= f->at);
2141 }
2142 int iam_leaf_invariant(struct iam_leaf *l)
2143 {
2144         return
2145                 l->il_bh != NULL &&
2146                 l->il_bh->b_data != NULL &&
2147                 ptr_inside(l->il_bh->b_data, l->il_bh->b_size, l->il_entries) &&
2148                 ptr_inside(l->il_bh->b_data, l->il_bh->b_size, l->il_at) &&
2149                 l->il_entries <= l->il_at;
2150 }
2151
2152 int iam_path_invariant(struct iam_path *p)
2153 {
2154         int i;
2155
2156         if (p->ip_container == NULL ||
2157             p->ip_indirect < 0 || p->ip_indirect > DX_MAX_TREE_HEIGHT - 1 ||
2158             p->ip_frame != p->ip_frames + p->ip_indirect ||
2159             !iam_leaf_invariant(&p->ip_leaf))
2160                 return 0;
2161         for (i = 0; i < ARRAY_SIZE(p->ip_frames); ++i) {
2162                 if (i <= p->ip_indirect) {
2163                         if (!iam_frame_invariant(&p->ip_frames[i]))
2164                                 return 0;
2165                 }
2166         }
2167         return 1;
2168 }
2169
2170 int iam_it_invariant(struct iam_iterator *it)
2171 {
2172         return
2173                 (it->ii_state == IAM_IT_DETACHED ||
2174                  it->ii_state == IAM_IT_ATTACHED ||
2175                  it->ii_state == IAM_IT_SKEWED) &&
2176                 !(it->ii_flags & ~(IAM_IT_MOVE | IAM_IT_WRITE)) &&
2177                 ergo(it->ii_state == IAM_IT_ATTACHED ||
2178                      it->ii_state == IAM_IT_SKEWED,
2179                      iam_path_invariant(&it->ii_path) &&
2180                      equi(it_at_rec(it), it->ii_state == IAM_IT_SKEWED));
2181 }
2182
2183 /*
2184  * Search container @c for record with key @k. If record is found, its data
2185  * are moved into @r.
2186  *
2187  * Return values: 0: found, -ENOENT: not-found, -ve: error
2188  */
2189 int iam_lookup(struct iam_container *c, const struct iam_key *k,
2190                struct iam_rec *r, struct iam_path_descr *pd)
2191 {
2192         struct iam_iterator it;
2193         int result;
2194
2195         iam_it_init(&it, c, 0, pd);
2196
2197         result = iam_it_get_exact(&it, k);
2198         if (result == 0)
2199                 /*
2200                  * record with required key found, copy it into user buffer
2201                  */
2202                 iam_reccpy(&it.ii_path.ip_leaf, r);
2203         iam_it_put(&it);
2204         iam_it_fini(&it);
2205         return result;
2206 }
2207 EXPORT_SYMBOL(iam_lookup);
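/*
 * A caller-side sketch (added for illustration; the record buffer size and
 * the way @c and @pd were obtained are assumptions of this example): look a
 * record up by key and copy it into an on-stack buffer large enough for the
 * container's record size.
 */
#if 0 /* illustrative example only, not compiled */
static int example_lookup(struct iam_container *c, struct iam_path_descr *pd,
                          const struct iam_key *key)
{
        char recbuf[64]; /* assumed to be >= the container's record size */
        int rc;

        rc = iam_lookup(c, key, (struct iam_rec *)recbuf, pd);
        /*
         * rc == 0: record found and copied into recbuf;
         * rc == -ENOENT: no record with this key;
         * other negative rc: error.
         */
        return rc;
}
#endif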
2208
2209 /*
2210  * Insert new record @r with key @k into container @c (within context of
2211  * transaction @h).
2212  *
2213  * Return values: 0: success, -ve: error, including -EEXIST when record with
2214  * given key is already present.
2215  *
2216  * postcondition: ergo(result == 0 || result == -EEXIST,
2217  *                     iam_lookup(c, k, r2) == 0)
2218  */
2219 int iam_insert(handle_t *h, struct iam_container *c, const struct iam_key *k,
2220                const struct iam_rec *r, struct iam_path_descr *pd)
2221 {
2222         struct iam_iterator it;
2223         int result;
2224
2225         iam_it_init(&it, c, IAM_IT_WRITE, pd);
2226
2227         result = iam_it_get_exact(&it, k);
2228         if (result == -ENOENT)
2229                 result = iam_it_rec_insert(h, &it, k, r);
2230         else if (result == 0)
2231                 result = -EEXIST;
2232         iam_it_put(&it);
2233         iam_it_fini(&it);
2234         return result;
2235 }
2236 EXPORT_SYMBOL(iam_insert);
2237
2238 /*
2239  * Update record with the key @k in container @c (within context of
2240  * transaction @h), new record is given by @r.
2241  *
2242  * Return values: 0: success, -ve: error, including -ENOENT if no record with
2243  * the given key found.
2244  */
2245 int iam_update(handle_t *h, struct iam_container *c, const struct iam_key *k,
2246                const struct iam_rec *r, struct iam_path_descr *pd)
2247 {
2248         struct iam_iterator it;
2249         int result;
2250
2251         iam_it_init(&it, c, IAM_IT_WRITE, pd);
2252
2253         result = iam_it_get_exact(&it, k);
2254         if (result == 0)
2255                 iam_it_rec_set(h, &it, r);
2256         iam_it_put(&it);
2257         iam_it_fini(&it);
2258         return result;
2259 }
2260 EXPORT_SYMBOL(iam_update);
2261
2262 /*
2263  * Delete existing record with key @k.
2264  *
2265  * Return values: 0: success, -ENOENT: not-found, -ve: other error.
2266  *
2267  * postcondition: ergo(result == 0 || result == -ENOENT,
2268  *                    iam_lookup(c, k, *) == -ENOENT)
2269  */
2270 int iam_delete(handle_t *h, struct iam_container *c, const struct iam_key *k,
2271                struct iam_path_descr *pd)
2272 {
2273         struct iam_iterator it;
2274         int result;
2275
2276         iam_it_init(&it, c, IAM_IT_WRITE, pd);
2277
2278         result = iam_it_get_exact(&it, k);
2279         if (result == 0)
2280                 iam_it_rec_delete(h, &it);
2281         iam_it_put(&it);
2282         iam_it_fini(&it);
2283         return result;
2284 }
2285 EXPORT_SYMBOL(iam_delete);
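/*
 * A minimal sketch (added for illustration; obtaining the journal handle @h,
 * the key and the record buffers is outside the iam API and assumed here)
 * tying the modifying operations together: insert a record, overwrite it,
 * then delete it, all within one already-started transaction.
 */
#if 0 /* illustrative example only, not compiled */
static int example_modify(handle_t *h, struct iam_container *c,
                          struct iam_path_descr *pd,
                          const struct iam_key *k,
                          const struct iam_rec *r_initial,
                          const struct iam_rec *r_updated)
{
        int rc;

        rc = iam_insert(h, c, k, r_initial, pd);     /* -EEXIST if present */
        if (rc == 0)
                rc = iam_update(h, c, k, r_updated, pd); /* -ENOENT if absent */
        if (rc == 0)
                rc = iam_delete(h, c, k, pd);        /* -ENOENT if absent */
        return rc;
}
#endif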
2286
2287 int iam_root_limit(int rootgap, int blocksize, int size)
2288 {
2289         int limit;
2290         int nlimit;
2291
2292         limit = (blocksize - rootgap) / size;
2293         nlimit = blocksize / size;
2294         if (limit == nlimit)
2295                 limit--;
2296         return limit;
2297 }
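/*
 * Worked example (numbers chosen for illustration only): with blocksize 4096,
 * rootgap 32 and size 16, limit = (4096 - 32) / 16 = 254 while
 * nlimit = 4096 / 16 = 256, so 254 is returned unchanged.  With size 24 and
 * rootgap 8, both divisions yield 170, so the result is decremented to 169;
 * this keeps the root limit strictly below a full node's limit, matching the
 * assert_corr(dx_root_limit(path) < dx_node_limit(path)) in split_index_node().
 */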