1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.gnu.org/licenses/gpl-2.0.html
19  *
20  * GPL HEADER END
21  */
22 /*
23  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Use is subject to license terms.
25  *
26  * Copyright (c) 2011, 2017, Intel Corporation.
27  */
28 /*
29  * This file is part of Lustre, http://www.lustre.org/
30  * Lustre is a trademark of Sun Microsystems, Inc.
31  *
32  * iam.c
33  * Top-level entry points into iam module
34  *
35  * Author: Wang Di <wangdi@clusterfs.com>
36  * Author: Nikita Danilov <nikita@clusterfs.com>
37  */
38
39 /*
40  * iam: big theory statement.
41  *
42  * iam (Index Access Module) is a module providing an abstraction of a
43  * persistent, transactional container on top of a generalized ldiskfs htree.
44  *
45  * iam supports:
46  *
47  *     - key, pointer, and record size specifiable per container.
48  *
49  *     - trees taller than 2 index levels.
50  *
51  *     - read/write to existing ldiskfs htree directories as iam containers.
52  *
53  * An iam container is a tree consisting of leaf nodes, which hold the keys
54  * and records stored in the container, and index nodes, which hold keys and
55  * pointers to leaf or other index nodes.
56  *
57  * iam does not work with keys directly; instead, it calls a user-supplied
58  * key comparison function (->dpo_keycmp()).
59  *
60  * Pointers are (currently) interpreted as logical offsets (measured in
61  * blocks) within the underlying flat file on top of which the iam tree lives.
62  *
63  * On-disk format:
64  *
65  * iam mostly tries to reuse existing htree formats.
66  *
67  * Format of index node:
68  *
69  * +-----+-------+-------+-------+------+-------+------------+
70  * |     | count |       |       |      |       |            |
71  * | gap |   /   | entry | entry | .... | entry | free space |
72  * |     | limit |       |       |      |       |            |
73  * +-----+-------+-------+-------+------+-------+------------+
74  *
75  *       gap           this part of the node is never accessed by iam code.
76  *                     It exists for binary compatibility with the ldiskfs
77  *                     htree (which, in turn, stores a fake struct ext2_dirent
78  *                     for ext2 compatibility), and to keep some unspecified
79  *                     per-node data. The gap can differ between root and
80  *                     non-root index nodes. The gap size can be specified
81  *                     for each container (a gap of 0 is allowed).
82  *
83  *       count/limit   current number of entries in this node, and the
84  *                     maximal number of entries that can fit into the node.
85  *                     count/limit occupies the same space as an entry, and
86  *                     is itself counted in count.
87  *
88  *       entry         index entry: consists of a key immediately followed
89  *                     by a pointer to a child node. The sizes of the key and
90  *                     the pointer depend on the container. Entries have
91  *                     neither alignment nor padding.
92  *
93  *       free space    portion of the node to which new entries are added
94  *
95  * Entries in an index node are sorted by their key value.
96  *
97  * Format of a leaf node is not specified. Generic iam code accesses leaf
98  * nodes through ->id_leaf methods in struct iam_descr.
99  *
100  * The IAM root block is a special node, which contains the IAM descriptor.
101  * Its on-disk format is:
102  *
103  * +---------+-------+--------+---------+-------+------+-------+------------+
104  * |IAM desc | count |  idle  |         |       |      |       |            |
105  * |(fix/var)|   /   | blocks | padding | entry | .... | entry | free space |
106  * |         | limit |        |         |       |      |       |            |
107  * +---------+-------+--------+---------+-------+------+-------+------------+
108  *
109  * The padding length is calculated from the parameters in the IAM descriptor.
110  *
111  * The field "idle_blocks" records empty leaf nodes: nodes whose entries
112  * have all been removed, but which have not yet been released. Ideally,
113  * these idle blocks would be reused whenever new leaf nodes are needed
114  * for new entries, but that requires the IAM hash functions to map the
115  * new entries onto the idle blocks, and it is hard to design hash
116  * functions that achieve such a mapping without hurting insert/lookup
117  * performance.
118  *
119  * Instead, the IAM recycles empty leaf nodes by putting them into a
120  * per-file pool of idle blocks. When a new leaf node is needed, a block
121  * is taken from this pool with priority, regardless of how the IAM hash
122  * functions would have mapped the entry.
123  *
124  * The idle blocks pool is organized as a series of tables, and each table
125  * can be described as follows (on-disk format):
126  *
127  * +---------+---------+---------+---------+------+---------+-------+
128  * |  magic  |  count  |  next   |  logic  |      |  logic  | free  |
129  * |(16 bits)|(16 bits)|  table  |  blk #  | .... |  blk #  | space |
130  * |         |         |(32 bits)|(32 bits)|      |(32 bits)|       |
131  * +---------+---------+---------+---------+------+---------+-------+
132  *
133  * The logical blk# of the first table is stored in the root node "idle_blocks".
134  *
135  */
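
/*
 * Editorial sketch (not part of the original source): the idle-table header
 * described above corresponds to struct iam_idle_head, defined elsewhere in
 * the osd code and accessed later in this file.  Inferred from the on-disk
 * diagram above and the fields used below (iih_magic, iih_count,
 * iih_blks[]), its layout is roughly:
 *
 *	struct iam_idle_head {
 *		__le16 iih_magic;	-- IAM_IDLE_HEADER_MAGIC
 *		__le16 iih_count;	-- number of idle blocks in this table
 *		__le32 iih_next;	-- blk # of the next table (name assumed)
 *		__le32 iih_blks[0];	-- logical blk #s of the idle blocks
 *	};
 */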
136
137 #include <linux/module.h>
138 #include <linux/fs.h>
139 #include <linux/pagemap.h>
140 #include <linux/time.h>
141 #include <linux/fcntl.h>
142 #include <linux/stat.h>
143 #include <linux/string.h>
144 #include <linux/quotaops.h>
145 #include <linux/buffer_head.h>
146
147 #include <ldiskfs/ldiskfs.h>
148 #include <ldiskfs/xattr.h>
149 #undef ENTRY
150
151 #include "osd_internal.h"
152
153 #include <ldiskfs/acl.h>
154
155 /*
156  * List of all registered formats.
157  *
158  * No locking. Callers synchronize.
159  */
160 static struct list_head iam_formats = LIST_HEAD_INIT(iam_formats);
161
162 void iam_format_register(struct iam_format *fmt)
163 {
164         list_add(&fmt->if_linkage, &iam_formats);
165 }
166
167 static struct buffer_head *
168 iam_load_idle_blocks(struct iam_container *c, iam_ptr_t blk)
169 {
170         struct inode *inode = c->ic_object;
171         struct iam_idle_head *head;
172         struct buffer_head *bh;
173
174         LASSERT(mutex_is_locked(&c->ic_idle_mutex));
175
176         if (blk == 0)
177                 return NULL;
178
179         bh = __ldiskfs_bread(NULL, inode, blk, 0);
180         if (IS_ERR_OR_NULL(bh)) {
181                 CERROR("%s: cannot load idle blocks, blk = %u, err = %ld\n",
182                        osd_ino2name(inode), blk, bh ? PTR_ERR(bh) : -EIO);
183                 c->ic_idle_failed = 1;
184                 if (bh == NULL)
185                         bh = ERR_PTR(-EIO);
186                 return bh;
187         }
188
189         head = (struct iam_idle_head *)(bh->b_data);
190         if (le16_to_cpu(head->iih_magic) != IAM_IDLE_HEADER_MAGIC) {
191                 CERROR("%s: invalid idle block head, blk = %u, magic = %d\n",
192                        osd_ino2name(inode), blk, le16_to_cpu(head->iih_magic));
193                 brelse(bh);
194                 c->ic_idle_failed = 1;
195                 return ERR_PTR(-EBADF);
196         }
197
198         return bh;
199 }
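
/*
 * Editorial note (not part of the original source): iam_load_idle_blocks()
 * returns NULL when there is no idle-block table (blk == 0), an ERR_PTR()
 * value on failure (in which case c->ic_idle_failed is also set), and
 * otherwise a pinned buffer head whose header carries a verified
 * IAM_IDLE_HEADER_MAGIC.
 */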
200
201 /*
202  * Determine the format of the given container. This is done by scanning the
203  * list of registered formats and calling the ->if_guess() method of each in turn.
204  */
205 static int iam_format_guess(struct iam_container *c)
206 {
207         int result;
208         struct iam_format *fmt;
209
210         /*
211          * XXX temporary initialization hook.
212          */
213         {
214                 static int initialized = 0;
215
216                 if (!initialized) {
217                         iam_lvar_format_init();
218                         iam_lfix_format_init();
219                         initialized = 1;
220                 }
221         }
222
223         result = -ENOENT;
224         list_for_each_entry(fmt, &iam_formats, if_linkage) {
225                 result = fmt->if_guess(c);
226                 if (result == 0)
227                         break;
228         }
229
230         if (result == 0) {
231                 struct buffer_head *bh;
232                 __u32 *idle_blocks;
233
234                 LASSERT(c->ic_root_bh != NULL);
235
236                 idle_blocks = (__u32 *)(c->ic_root_bh->b_data +
237                                         c->ic_descr->id_root_gap +
238                                         sizeof(struct dx_countlimit));
239                 mutex_lock(&c->ic_idle_mutex);
240                 bh = iam_load_idle_blocks(c, le32_to_cpu(*idle_blocks));
241                 if (bh != NULL && IS_ERR(bh))
242                         result = PTR_ERR(bh);
243                 else
244                         c->ic_idle_bh = bh;
245                 mutex_unlock(&c->ic_idle_mutex);
246         }
247
248         return result;
249 }
250
251 /*
252  * Initialize container @c.
253  */
254 int iam_container_init(struct iam_container *c,
255                        struct iam_descr *descr, struct inode *inode)
256 {
257         memset(c, 0, sizeof *c);
258         c->ic_descr = descr;
259         c->ic_object = inode;
260         init_rwsem(&c->ic_sem);
261         dynlock_init(&c->ic_tree_lock);
262         mutex_init(&c->ic_idle_mutex);
263         return 0;
264 }
265
266 /*
267  * Determine container format.
268  */
269 int iam_container_setup(struct iam_container *c)
270 {
271         return iam_format_guess(c);
272 }
273
274 /*
275  * Finalize container @c, release all resources.
276  */
277 void iam_container_fini(struct iam_container *c)
278 {
279         brelse(c->ic_idle_bh);
280         c->ic_idle_bh = NULL;
281         brelse(c->ic_root_bh);
282         c->ic_root_bh = NULL;
283 }
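
/*
 * Editorial note (not part of the original source): taken together, the
 * three functions above suggest the following container life cycle:
 *
 *	iam_container_init(&c, descr, inode);	-- set up fields and locks
 *	iam_container_setup(&c);		-- detect the on-disk format and
 *						   populate ic_root_bh/ic_idle_bh
 *	... use the container ...
 *	iam_container_fini(&c);			-- release root and idle buffers
 */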
284
285 void iam_path_init(struct iam_path *path, struct iam_container *c,
286                    struct iam_path_descr *pd)
287 {
288         memset(path, 0, sizeof *path);
289         path->ip_container = c;
290         path->ip_frame = path->ip_frames;
291         path->ip_data = pd;
292         path->ip_leaf.il_path = path;
293 }
294
295 static void iam_leaf_fini(struct iam_leaf *leaf);
296
297 void iam_path_release(struct iam_path *path)
298 {
299         int i;
300
301         for (i = 0; i < ARRAY_SIZE(path->ip_frames); i++) {
302                 if (path->ip_frames[i].bh != NULL) {
303                         path->ip_frames[i].at_shifted = 0;
304                         brelse(path->ip_frames[i].bh);
305                         path->ip_frames[i].bh = NULL;
306                 }
307         }
308 }
309
310 void iam_path_fini(struct iam_path *path)
311 {
312         iam_leaf_fini(&path->ip_leaf);
313         iam_path_release(path);
314 }
315
316
317 void iam_path_compat_init(struct iam_path_compat *path, struct inode *inode)
318 {
319         int i;
320
321         path->ipc_hinfo = &path->ipc_hinfo_area;
322         for (i = 0; i < ARRAY_SIZE(path->ipc_scratch); ++i)
323                 path->ipc_descr.ipd_key_scratch[i] =
324                         (struct iam_ikey *)&path->ipc_scratch[i];
325
326         iam_path_init(&path->ipc_path, &path->ipc_container, &path->ipc_descr);
327 }
328
329 void iam_path_compat_fini(struct iam_path_compat *path)
330 {
331         iam_path_fini(&path->ipc_path);
332 }
333
334 /*
335  * Helper function initializing iam_path_descr and its key scratch area.
336  */
337 struct iam_path_descr *iam_ipd_alloc(void *area, int keysize)
338 {
339         struct iam_path_descr *ipd;
340         void *karea;
341         int i;
342
343         ipd = area;
344         karea = ipd + 1;
345         for (i = 0; i < ARRAY_SIZE(ipd->ipd_key_scratch); ++i, karea += keysize)
346                 ipd->ipd_key_scratch[i] = karea;
347         return ipd;
348 }
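
/*
 * Editorial note (not part of the original source): iam_ipd_alloc() does not
 * allocate memory itself; the caller supplies @area, which, from the code
 * above, must be at least
 *
 *	sizeof(struct iam_path_descr) +
 *		keysize * ARRAY_SIZE(ipd->ipd_key_scratch)
 *
 * bytes: the descriptor sits at the start of the area and the scratch keys
 * are laid out immediately after it, keysize bytes apart.
 */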
349
350 void iam_ipd_free(struct iam_path_descr *ipd)
351 {
352 }
353
354 int iam_node_read(struct iam_container *c, iam_ptr_t ptr,
355                   handle_t *h, struct buffer_head **bh)
356 {
357         /*
358          * NB: this can be called by iam_lfix_guess(), which runs at a very
359          * early stage, when c->ic_root_bh and c->ic_descr->id_ops
360          * haven't been initialized yet.
361          * Also, we don't have this for IAM dir.
362          */
363         if (c->ic_root_bh != NULL &&
364             c->ic_descr->id_ops->id_root_ptr(c) == ptr) {
365                 get_bh(c->ic_root_bh);
366                 *bh = c->ic_root_bh;
367                 return 0;
368         }
369
370         *bh = __ldiskfs_bread(h, c->ic_object, (int)ptr, 0);
371         if (IS_ERR(*bh))
372                 return PTR_ERR(*bh);
373
374         if (*bh == NULL)
375                 return -EIO;
376
377         return 0;
378 }
379
380 /*
381  * Return pointer to current leaf record. Pointer is valid while corresponding
382  * leaf node is locked and pinned.
383  */
384 static struct iam_rec *iam_leaf_rec(const struct iam_leaf *leaf)
385 {
386         return iam_leaf_ops(leaf)->rec(leaf);
387 }
388
389 /*
390  * Return pointer to the current leaf key. This function returns pointer to
391  * the key stored in node.
392  *
393  * Caller should assume that returned pointer is only valid while leaf node is
394  * pinned and locked.
395  */
396 static struct iam_key *iam_leaf_key(const struct iam_leaf *leaf)
397 {
398         return iam_leaf_ops(leaf)->key(leaf);
399 }
400
401 static int iam_leaf_key_size(const struct iam_leaf *leaf)
402 {
403         return iam_leaf_ops(leaf)->key_size(leaf);
404 }
405
406 static struct iam_ikey *iam_leaf_ikey(const struct iam_leaf *leaf,
407                                       struct iam_ikey *key)
408 {
409         return iam_leaf_ops(leaf)->ikey(leaf, key);
410 }
411
412 static int iam_leaf_keycmp(const struct iam_leaf *leaf,
413                            const struct iam_key *key)
414 {
415         return iam_leaf_ops(leaf)->key_cmp(leaf, key);
416 }
417
418 static int iam_leaf_keyeq(const struct iam_leaf *leaf,
419                           const struct iam_key *key)
420 {
421         return iam_leaf_ops(leaf)->key_eq(leaf, key);
422 }
423
424 #if LDISKFS_INVARIANT_ON
425 extern int dx_node_check(struct iam_path *p, struct iam_frame *f);
426
427 static int iam_path_check(struct iam_path *p)
428 {
429         int i;
430         int result;
431         struct iam_frame *f;
432         struct iam_descr *param;
433
434         result = 1;
435         param = iam_path_descr(p);
436         for (i = 0; result && i < ARRAY_SIZE(p->ip_frames); ++i) {
437                 f = &p->ip_frames[i];
438                 if (f->bh != NULL) {
439                         result = dx_node_check(p, f);
440                         if (result)
441                                 result = !param->id_ops->id_node_check(p, f);
442                 }
443         }
444         if (result && p->ip_leaf.il_bh != NULL)
445                 result = 1;
446         if (result == 0)
447                 ldiskfs_std_error(iam_path_obj(p)->i_sb, result);
448
449         return result;
450 }
451 #endif
452
453 static int iam_leaf_load(struct iam_path *path)
454 {
455         iam_ptr_t block;
456         int err;
457         struct iam_container *c;
458         struct buffer_head *bh;
459         struct iam_leaf *leaf;
460         struct iam_descr *descr;
461
462         c     = path->ip_container;
463         leaf  = &path->ip_leaf;
464         descr = iam_path_descr(path);
465         block = path->ip_frame->leaf;
466         if (block == 0) {
467                 /* XXX bug 11027 */
468                 printk(KERN_EMERG "wrong leaf: %lu %d [%p %p %p]\n",
469                        (long unsigned)path->ip_frame->leaf,
470                        dx_get_count(dx_node_get_entries(path, path->ip_frame)),
471                        path->ip_frames[0].bh, path->ip_frames[1].bh,
472                        path->ip_frames[2].bh);
473         }
474         err = descr->id_ops->id_node_read(c, block, NULL, &bh);
475         if (err == 0) {
476                 leaf->il_bh = bh;
477                 leaf->il_curidx = block;
478                 err = iam_leaf_ops(leaf)->init(leaf);
479         }
480         return err;
481 }
482
483 static void iam_unlock_htree(struct iam_container *ic,
484                              struct dynlock_handle *lh)
485 {
486         if (lh != NULL)
487                 dynlock_unlock(&ic->ic_tree_lock, lh);
488 }
489
490
491 static void iam_leaf_unlock(struct iam_leaf *leaf)
492 {
493         if (leaf->il_lock != NULL) {
494                 iam_unlock_htree(iam_leaf_container(leaf),
495                                  leaf->il_lock);
496                 do_corr(schedule());
497                 leaf->il_lock = NULL;
498         }
499 }
500
501 static void iam_leaf_fini(struct iam_leaf *leaf)
502 {
503         if (leaf->il_path != NULL) {
504                 iam_leaf_unlock(leaf);
505                 iam_leaf_ops(leaf)->fini(leaf);
506                 if (leaf->il_bh) {
507                         brelse(leaf->il_bh);
508                         leaf->il_bh = NULL;
509                         leaf->il_curidx = 0;
510                 }
511         }
512 }
513
514 static void iam_leaf_start(struct iam_leaf *folio)
515 {
516         iam_leaf_ops(folio)->start(folio);
517 }
518
519 void iam_leaf_next(struct iam_leaf *folio)
520 {
521         iam_leaf_ops(folio)->next(folio);
522 }
523
524 static void iam_leaf_rec_add(struct iam_leaf *leaf, const struct iam_key *key,
525                              const struct iam_rec *rec)
526 {
527         iam_leaf_ops(leaf)->rec_add(leaf, key, rec);
528 }
529
530 static void iam_rec_del(struct iam_leaf *leaf, int shift)
531 {
532         iam_leaf_ops(leaf)->rec_del(leaf, shift);
533 }
534
535 int iam_leaf_at_end(const struct iam_leaf *leaf)
536 {
537         return iam_leaf_ops(leaf)->at_end(leaf);
538 }
539
540 static void iam_leaf_split(struct iam_leaf *l, struct buffer_head **bh,
541                            iam_ptr_t nr)
542 {
543         iam_leaf_ops(l)->split(l, bh, nr);
544 }
545
546 static inline int iam_leaf_empty(struct iam_leaf *l)
547 {
548         return iam_leaf_ops(l)->leaf_empty(l);
549 }
550
551 int iam_leaf_can_add(const struct iam_leaf *l,
552                      const struct iam_key *k, const struct iam_rec *r)
553 {
554         return iam_leaf_ops(l)->can_add(l, k, r);
555 }
556
557 static int iam_txn_dirty(handle_t *handle,
558                          struct iam_path *path, struct buffer_head *bh)
559 {
560         int result;
561
562         result = ldiskfs_handle_dirty_metadata(handle, NULL, bh);
563         if (result != 0)
564                 ldiskfs_std_error(iam_path_obj(path)->i_sb, result);
565         return result;
566 }
567
568 static int iam_txn_add(handle_t *handle,
569                        struct iam_path *path, struct buffer_head *bh)
570 {
571         int result;
572
573         result = ldiskfs_journal_get_write_access(handle, bh);
574         if (result != 0)
575                 ldiskfs_std_error(iam_path_obj(path)->i_sb, result);
576         return result;
577 }
578
579 /***********************************************************************/
580 /* iterator interface                                                  */
581 /***********************************************************************/
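
/*
 * Editorial sketch (not part of the original source): a typical read-only
 * lookup through the iterator interface defined below might look roughly
 * like this (error handling elided; @c, @pd and @key are caller-provided):
 *
 *	struct iam_iterator it;
 *
 *	iam_container_read_lock(c);
 *	iam_it_init(&it, c, 0, pd);
 *	if (iam_it_get_at(&it, key) >= 0) {
 *		struct iam_rec *rec = iam_it_rec_get(&it);
 *		... copy the record out ...
 *	}
 *	iam_it_put(&it);
 *	iam_it_fini(&it);
 *	iam_container_read_unlock(c);
 */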
582
583 static enum iam_it_state it_state(const struct iam_iterator *it)
584 {
585         return it->ii_state;
586 }
587
588 /*
589  * Helper function returning the container an iterator is attached to.
590  */
591 static struct iam_container *iam_it_container(const struct iam_iterator *it)
592 {
593         return it->ii_path.ip_container;
594 }
595
596 static inline int it_keycmp(const struct iam_iterator *it,
597                             const struct iam_key *k)
598 {
599         return iam_leaf_keycmp(&it->ii_path.ip_leaf, k);
600 }
601
602 static inline int it_keyeq(const struct iam_iterator *it,
603                            const struct iam_key *k)
604 {
605         return iam_leaf_keyeq(&it->ii_path.ip_leaf, k);
606 }
607
608 static int it_ikeycmp(const struct iam_iterator *it, const struct iam_ikey *ik)
609 {
610         return iam_ikeycmp(it->ii_path.ip_container,
611                            iam_leaf_ikey(&it->ii_path.ip_leaf,
612                                         iam_path_ikey(&it->ii_path, 0)), ik);
613 }
614
615 static inline int it_at_rec(const struct iam_iterator *it)
616 {
617         return !iam_leaf_at_end(&it->ii_path.ip_leaf);
618 }
619
620 static inline int it_before(const struct iam_iterator *it)
621 {
622         return it_state(it) == IAM_IT_SKEWED && it_at_rec(it);
623 }
624
625 /*
626  * Helper wrapper around iam_it_get(): returns 0 (success) only when record
627  * with exactly the same key as asked is found.
628  */
629 static int iam_it_get_exact(struct iam_iterator *it, const struct iam_key *k)
630 {
631         int result;
632
633         result = iam_it_get(it, k);
634         if (result > 0)
635                 result = 0;
636         else if (result == 0)
637                 /*
638                  * Return -ENOENT if the cursor is located above a record with
639                  * a key different from the one specified, or in an empty leaf.
640                  *
641                  * XXX returning -ENOENT only works if iam_it_get() never
642                  * returns -ENOENT as a legitimate error.
643                  */
644                 result = -ENOENT;
645         return result;
646 }
647
648 void iam_container_write_lock(struct iam_container *ic)
649 {
650         down_write(&ic->ic_sem);
651 }
652
653 void iam_container_write_unlock(struct iam_container *ic)
654 {
655         up_write(&ic->ic_sem);
656 }
657
658 void iam_container_read_lock(struct iam_container *ic)
659 {
660         down_read(&ic->ic_sem);
661 }
662
663 void iam_container_read_unlock(struct iam_container *ic)
664 {
665         up_read(&ic->ic_sem);
666 }
667
668 /*
669  * Initialize iterator to IAM_IT_DETACHED state.
670  *
671  * postcondition: it_state(it) == IAM_IT_DETACHED
672  */
673 int  iam_it_init(struct iam_iterator *it, struct iam_container *c, __u32 flags,
674                  struct iam_path_descr *pd)
675 {
676         memset(it, 0, sizeof *it);
677         it->ii_flags  = flags;
678         it->ii_state  = IAM_IT_DETACHED;
679         iam_path_init(&it->ii_path, c, pd);
680         return 0;
681 }
682
683 /*
684  * Finalize iterator and release all resources.
685  *
686  * precondition: it_state(it) == IAM_IT_DETACHED
687  */
688 void iam_it_fini(struct iam_iterator *it)
689 {
690         assert_corr(it_state(it) == IAM_IT_DETACHED);
691         iam_path_fini(&it->ii_path);
692 }
693
694 /*
695  * These locking primitives are used to protect parts of the dir's htree.
696  * The protection unit is a block: either a leaf or an index node.
697  */
698 static struct dynlock_handle *iam_lock_htree(struct iam_container *ic,
699                                              unsigned long value,
700                                              enum dynlock_type lt)
701 {
702         return dynlock_lock(&ic->ic_tree_lock, value, lt, GFP_NOFS);
703 }
704
705 static int iam_index_lock(struct iam_path *path, struct dynlock_handle **lh)
706 {
707         struct iam_frame *f;
708
709         for (f = path->ip_frame; f >= path->ip_frames; --f, ++lh) {
710                 do_corr(schedule());
711                 *lh = iam_lock_htree(path->ip_container, f->curidx, DLT_READ);
712                 if (*lh == NULL)
713                         return -ENOMEM;
714         }
715         return 0;
716 }
717
718 /*
719  * Fast check for frame consistency.
720  */
721 static int iam_check_fast(struct iam_path *path, struct iam_frame *frame)
722 {
723         struct iam_container *bag;
724         struct iam_entry *next;
725         struct iam_entry *last;
726         struct iam_entry *entries;
727         struct iam_entry *at;
728
729         bag = path->ip_container;
730         at = frame->at;
731         entries = frame->entries;
732         last = iam_entry_shift(path, entries, dx_get_count(entries) - 1);
733
734         if (unlikely(at > last))
735                 return -EAGAIN;
736
737         if (unlikely(dx_get_block(path, at) != frame->leaf))
738                 return -EAGAIN;
739
740         if (unlikely(iam_ikeycmp(bag, iam_ikey_at(path, at),
741                      path->ip_ikey_target) > 0))
742                 return -EAGAIN;
743
744         next = iam_entry_shift(path, at, +1);
745         if (next <= last) {
746                 if (unlikely(iam_ikeycmp(bag, iam_ikey_at(path, next),
747                                          path->ip_ikey_target) <= 0))
748                         return -EAGAIN;
749         }
750         return 0;
751 }
752
753 int dx_index_is_compat(struct iam_path *path)
754 {
755         return iam_path_descr(path) == NULL;
756 }
757
758 /*
759  * iam_find_position
760  *
761  * search for the position of the specified hash (index key) in the index node
762  *
763  */
764
765 static struct iam_entry *iam_find_position(struct iam_path *path,
766                                            struct iam_frame *frame)
767 {
768         int count;
769         struct iam_entry *p;
770         struct iam_entry *q;
771         struct iam_entry *m;
772
773         count = dx_get_count(frame->entries);
774         assert_corr(count && count <= dx_get_limit(frame->entries));
775         p = iam_entry_shift(path, frame->entries,
776                             dx_index_is_compat(path) ? 1 : 2);
777         q = iam_entry_shift(path, frame->entries, count - 1);
778         while (p <= q) {
779                 m = iam_entry_shift(path, p, iam_entry_diff(path, q, p) / 2);
780                 if (iam_ikeycmp(path->ip_container, iam_ikey_at(path, m),
781                                 path->ip_ikey_target) > 0)
782                         q = iam_entry_shift(path, m, -1);
783                 else
784                         p = iam_entry_shift(path, m, +1);
785         }
786         return iam_entry_shift(path, p, -1);
787 }
788
789
790
791 static iam_ptr_t iam_find_ptr(struct iam_path *path, struct iam_frame *frame)
792 {
793         return dx_get_block(path, iam_find_position(path, frame));
794 }
795
796 void iam_insert_key(struct iam_path *path, struct iam_frame *frame,
797                     const struct iam_ikey *key, iam_ptr_t ptr)
798 {
799         struct iam_entry *entries = frame->entries;
800         struct iam_entry *new = iam_entry_shift(path, frame->at, +1);
801         int count = dx_get_count(entries);
802
803         /*
804          * Unfortunately we cannot assert this, as this function is sometimes
805          * called by VFS under i_sem and without pdirops lock.
806          */
807         assert_corr(1 || iam_frame_is_locked(path, frame));
808         assert_corr(count < dx_get_limit(entries));
809         assert_corr(frame->at < iam_entry_shift(path, entries, count));
810         assert_inv(dx_node_check(path, frame));
811
812         memmove(iam_entry_shift(path, new, 1), new,
813                 (char *)iam_entry_shift(path, entries, count) - (char *)new);
814         dx_set_ikey(path, new, key);
815         dx_set_block(path, new, ptr);
816         dx_set_count(entries, count + 1);
817         assert_inv(dx_node_check(path, frame));
818 }
819
820 void iam_insert_key_lock(struct iam_path *path, struct iam_frame *frame,
821                          const struct iam_ikey *key, iam_ptr_t ptr)
822 {
823         iam_lock_bh(frame->bh);
824         iam_insert_key(path, frame, key, ptr);
825         iam_unlock_bh(frame->bh);
826 }
827 /*
828  * returns 0 if path was unchanged, -EAGAIN otherwise.
829  */
830 static int iam_check_path(struct iam_path *path, struct iam_frame *frame)
831 {
832         int equal;
833
834         iam_lock_bh(frame->bh);
835         equal = iam_check_fast(path, frame) == 0 ||
836                 frame->leaf == iam_find_ptr(path, frame);
837         DX_DEVAL(iam_lock_stats.dls_bh_again += !equal);
838         iam_unlock_bh(frame->bh);
839
840         return equal ? 0 : -EAGAIN;
841 }
842
843 static int iam_lookup_try(struct iam_path *path)
844 {
845         u32 ptr;
846         int err = 0;
847         int i;
848
849         struct iam_descr *param;
850         struct iam_frame *frame;
851         struct iam_container *c;
852
853         param = iam_path_descr(path);
854         c = path->ip_container;
855
856         ptr = param->id_ops->id_root_ptr(c);
857         for (frame = path->ip_frames, i = 0; i <= path->ip_indirect;
858              ++frame, ++i) {
859                 err = param->id_ops->id_node_read(c, (iam_ptr_t)ptr, NULL,
860                                                   &frame->bh);
861                 do_corr(schedule());
862
863                 iam_lock_bh(frame->bh);
864                 /*
865                  * the node must be initialized under the bh lock because a
866                  * concurrent creation procedure may change it and
867                  * iam_lookup_try() would see an obsolete tree height. -bzzz
868                  */
869                 if (err != 0)
870                         break;
871
872                 if (LDISKFS_INVARIANT_ON) {
873                         err = param->id_ops->id_node_check(path, frame);
874                         if (err != 0)
875                                 break;
876                 }
877
878                 err = param->id_ops->id_node_load(path, frame);
879                 if (err != 0)
880                         break;
881
882                 assert_inv(dx_node_check(path, frame));
883                 /*
884                  * splitting may change the root index block and move the hash
885                  * we're looking for into another index block, so we have to
886                  * check for this situation and repeat from the beginning if
887                  * the path got changed -bzzz
888                  */
889                 if (i > 0) {
890                         err = iam_check_path(path, frame - 1);
891                         if (err != 0)
892                                 break;
893                 }
894
895                 frame->at = iam_find_position(path, frame);
896                 frame->curidx = ptr;
897                 frame->leaf = ptr = dx_get_block(path, frame->at);
898
899                 iam_unlock_bh(frame->bh);
900                 do_corr(schedule());
901         }
902         if (err != 0)
903                 iam_unlock_bh(frame->bh);
904         path->ip_frame = --frame;
905         return err;
906 }
907
908 static int __iam_path_lookup(struct iam_path *path)
909 {
910         int err;
911         int i;
912
913         for (i = 0; i < DX_MAX_TREE_HEIGHT; ++i)
914                 assert(path->ip_frames[i].bh == NULL);
915
916         do {
917                 err = iam_lookup_try(path);
918                 do_corr(schedule());
919                 if (err != 0)
920                         iam_path_fini(path);
921         } while (err == -EAGAIN);
922
923         return err;
924 }
925
926 /*
927  * returns 0 if path was unchanged, -EAGAIN otherwise.
928  */
929 static int iam_check_full_path(struct iam_path *path, int search)
930 {
931         struct iam_frame *bottom;
932         struct iam_frame *scan;
933         int i;
934         int result;
935
936         do_corr(schedule());
937
938         for (bottom = path->ip_frames, i = 0;
939              i < DX_MAX_TREE_HEIGHT && bottom->bh != NULL; ++bottom, ++i) {
940                 ; /* find last filled in frame */
941         }
942
943         /*
944          * Lock frames, bottom to top.
945          */
946         for (scan = bottom - 1; scan >= path->ip_frames; --scan)
947                 iam_lock_bh(scan->bh);
948         /*
949          * Check them top to bottom.
950          */
951         result = 0;
952         for (scan = path->ip_frames; scan < bottom; ++scan) {
953                 struct iam_entry *pos;
954
955                 if (search) {
956                         if (iam_check_fast(path, scan) == 0)
957                                 continue;
958
959                         pos = iam_find_position(path, scan);
960                         if (scan->leaf != dx_get_block(path, pos)) {
961                                 result = -EAGAIN;
962                                 break;
963                         }
964                         scan->at = pos;
965                 } else {
966                         pos = iam_entry_shift(path, scan->entries,
967                                               dx_get_count(scan->entries) - 1);
968                         if (scan->at > pos ||
969                             scan->leaf != dx_get_block(path, scan->at)) {
970                                 result = -EAGAIN;
971                                 break;
972                         }
973                 }
974         }
975
976         /*
977          * Unlock top to bottom.
978          */
979         for (scan = path->ip_frames; scan < bottom; ++scan)
980                 iam_unlock_bh(scan->bh);
981         DX_DEVAL(iam_lock_stats.dls_bh_full_again += !!result);
982         do_corr(schedule());
983
984         return result;
985 }
986
987
988 /*
989  * Performs path lookup and returns with found leaf (if any) locked by htree
990  * lock.
991  */
992 static int iam_lookup_lock(struct iam_path *path,
993                            struct dynlock_handle **dl, enum dynlock_type lt)
994 {
995         int result;
996
997         while ((result = __iam_path_lookup(path)) == 0) {
998                 do_corr(schedule());
999                 *dl = iam_lock_htree(path->ip_container, path->ip_frame->leaf,
1000                                      lt);
1001                 if (*dl == NULL) {
1002                         iam_path_fini(path);
1003                         result = -ENOMEM;
1004                         break;
1005                 }
1006                 do_corr(schedule());
1007                 /*
1008                  * while locking, the leaf we just found may get split, so we
1009                  * need to check for this -bzzz
1010                  */
1011                 if (iam_check_full_path(path, 1) == 0)
1012                         break;
1013                 iam_unlock_htree(path->ip_container, *dl);
1014                 *dl = NULL;
1015                 iam_path_fini(path);
1016         }
1017         return result;
1018 }
1019 /*
1020  * Performs tree top-to-bottom traversal starting from root, and loads leaf
1021  * node.
1022  */
1023 static int iam_path_lookup(struct iam_path *path, int index)
1024 {
1025         struct iam_leaf  *leaf;
1026         int result;
1027
1028         leaf = &path->ip_leaf;
1029         result = iam_lookup_lock(path, &leaf->il_lock, DLT_WRITE);
1030         assert_inv(iam_path_check(path));
1031         do_corr(schedule());
1032         if (result == 0) {
1033                 result = iam_leaf_load(path);
1034                 if (result == 0) {
1035                         do_corr(schedule());
1036                         if (index)
1037                                 result = iam_leaf_ops(leaf)->
1038                                         ilookup(leaf, path->ip_ikey_target);
1039                         else
1040                                 result = iam_leaf_ops(leaf)->
1041                                         lookup(leaf, path->ip_key_target);
1042                         do_corr(schedule());
1043                 }
1044                 if (result < 0)
1045                         iam_leaf_unlock(leaf);
1046         }
1047         return result;
1048 }
1049
1050 /*
1051  * Common part of iam_it_{i,}get().
1052  */
1053 static int __iam_it_get(struct iam_iterator *it, int index)
1054 {
1055         int result;
1056
1057         assert_corr(it_state(it) == IAM_IT_DETACHED);
1058
1059         result = iam_path_lookup(&it->ii_path, index);
1060         if (result >= 0) {
1061                 int collision;
1062
1063                 collision = result & IAM_LOOKUP_LAST;
1064                 switch (result & ~IAM_LOOKUP_LAST) {
1065                 case IAM_LOOKUP_EXACT:
1066                         result = +1;
1067                         it->ii_state = IAM_IT_ATTACHED;
1068                         break;
1069                 case IAM_LOOKUP_OK:
1070                         result = 0;
1071                         it->ii_state = IAM_IT_ATTACHED;
1072                         break;
1073                 case IAM_LOOKUP_BEFORE:
1074                 case IAM_LOOKUP_EMPTY:
1075                         result = 0;
1076                         it->ii_state = IAM_IT_SKEWED;
1077                         break;
1078                 default:
1079                         assert(0);
1080                 }
1081                 result |= collision;
1082         }
1083         /*
1084          * See iam_it_get_exact() for explanation.
1085          */
1086         assert_corr(result != -ENOENT);
1087         return result;
1088 }
1089
1090 /*
1091  * The correct hash, but not the same key, was found; iterate through the
1092  * hash collision chain, looking for the correct record.
1093  */
1094 static int iam_it_collision(struct iam_iterator *it)
1095 {
1096         int result;
1097
1098         assert(ergo(it_at_rec(it), !it_keyeq(it, it->ii_path.ip_key_target)));
1099
1100         while ((result = iam_it_next(it)) == 0) {
1101                 do_corr(schedule());
1102                 if (it_ikeycmp(it, it->ii_path.ip_ikey_target) != 0)
1103                         return -ENOENT;
1104                 if (it_keyeq(it, it->ii_path.ip_key_target))
1105                         return 0;
1106         }
1107         return result;
1108 }
1109
1110 /*
1111  * Attach iterator. After successful completion, @it points to record with
1112  * least key not larger than @k.
1113  *
1114  * Return value: 0: positioned on existing record,
1115  *             +ve: exact position found,
1116  *             -ve: error.
1117  *
1118  * precondition:  it_state(it) == IAM_IT_DETACHED
1119  * postcondition: ergo(result == 0 && it_state(it) == IAM_IT_ATTACHED,
1120  *                     it_keycmp(it, k) <= 0)
1121  */
1122 int iam_it_get(struct iam_iterator *it, const struct iam_key *k)
1123 {
1124         int result;
1125
1126         assert_corr(it_state(it) == IAM_IT_DETACHED);
1127
1128         it->ii_path.ip_ikey_target = NULL;
1129         it->ii_path.ip_key_target  = k;
1130
1131         result = __iam_it_get(it, 0);
1132
1133         if (result == IAM_LOOKUP_LAST) {
1134                 result = iam_it_collision(it);
1135                 if (result != 0) {
1136                         iam_it_put(it);
1137                         iam_it_fini(it);
1138                         result = __iam_it_get(it, 0);
1139                 } else
1140                         result = +1;
1141         }
1142         if (result > 0)
1143                 result &= ~IAM_LOOKUP_LAST;
1144
1145         assert_corr(ergo(result > 0, it_keycmp(it, k) == 0));
1146         assert_corr(ergo(result == 0 && it_state(it) == IAM_IT_ATTACHED,
1147                     it_keycmp(it, k) <= 0));
1148         return result;
1149 }
1150
1151 /*
1152  * Attach iterator by index key.
1153  */
1154 static int iam_it_iget(struct iam_iterator *it, const struct iam_ikey *k)
1155 {
1156         assert_corr(it_state(it) == IAM_IT_DETACHED);
1157
1158         it->ii_path.ip_ikey_target = k;
1159         return __iam_it_get(it, 1) & ~IAM_LOOKUP_LAST;
1160 }
1161
1162 /*
1163  * Attach iterator, and ensure it points to the record (not skewed).
1164  *
1165  * Return value: 0: positioned on existing record,
1166  *             +ve: exact position found,
1167  *             -ve: error.
1168  *
1169  * precondition:  it_state(it) == IAM_IT_DETACHED &&
1170  *                !(it->ii_flags&IAM_IT_WRITE)
1171  * postcondition: ergo(result == 0, it_state(it) == IAM_IT_ATTACHED)
1172  */
1173 int iam_it_get_at(struct iam_iterator *it, const struct iam_key *k)
1174 {
1175         int result;
1176
1177         assert_corr(it_state(it) == IAM_IT_DETACHED &&
1178                     !(it->ii_flags&IAM_IT_WRITE));
1179         result = iam_it_get(it, k);
1180         if (result == 0) {
1181                 if (it_state(it) != IAM_IT_ATTACHED) {
1182                         assert_corr(it_state(it) == IAM_IT_SKEWED);
1183                         result = iam_it_next(it);
1184                 }
1185         }
1186         assert_corr(ergo(result >= 0, it_state(it) == IAM_IT_ATTACHED));
1187         return result;
1188 }
1189
1190 /*
1191  * Duplicates iterator.
1192  *
1193  * postcondition: it_state(dst) == it_state(src) &&
1194  *                iam_it_container(dst) == iam_it_container(src) &&
1195  *                dst->ii_flags == src->ii_flags &&
1196  *                ergo(it_state(src) == IAM_IT_ATTACHED,
1197  *                     iam_it_rec_get(dst) == iam_it_rec_get(src) &&
1198  *                     iam_it_key_get(dst) == iam_it_key_get(src))
1199  */
1200 void iam_it_dup(struct iam_iterator *dst, const struct iam_iterator *src)
1201 {
1202         dst->ii_flags = src->ii_flags;
1203         dst->ii_state = src->ii_state;
1204         /* XXX not yet. iam_path_dup(&dst->ii_path, &src->ii_path); */
1205         /*
1206          * XXX: duplicate lock.
1207          */
1208         assert_corr(it_state(dst) == it_state(src));
1209         assert_corr(iam_it_container(dst) == iam_it_container(src));
1210         assert_corr(dst->ii_flags == src->ii_flags);
1211         assert_corr(ergo(it_state(src) == IAM_IT_ATTACHED,
1212                     iam_it_rec_get(dst) == iam_it_rec_get(src) &&
1213                     iam_it_key_get(dst) == iam_it_key_get(src)));
1214 }
1215
1216 /*
1217  * Detach iterator. Does nothing if it is already in the detached state.
1218  *
1219  * postcondition: it_state(it) == IAM_IT_DETACHED
1220  */
1221 void iam_it_put(struct iam_iterator *it)
1222 {
1223         if (it->ii_state != IAM_IT_DETACHED) {
1224                 it->ii_state = IAM_IT_DETACHED;
1225                 iam_leaf_fini(&it->ii_path.ip_leaf);
1226         }
1227 }
1228
1229 static struct iam_ikey *iam_it_ikey_get(const struct iam_iterator *it,
1230                                         struct iam_ikey *ikey);
1231
1232
1233 /*
1234  * This function increments the frame pointer to search the next leaf
1235  * block, and reads in the necessary intervening nodes if the search
1236  * should be necessary.  Whether or not the search is necessary is
1237  * controlled by the hash parameter.  If the hash value is even, then
1238  * the search is only continued if the next block starts with that
1239  * hash value.  This is used if we are searching for a specific file.
1240  *
1241  * If the hash value is HASH_NB_ALWAYS, then always go to the next block.
1242  *
1243  * This function returns 1 if the caller should continue to search,
1244  * or 0 if it should not.  If there is an error reading one of the
1245  * index blocks, it will return a negative error code.
1246  *
1247  * If start_hash is non-null, it will be filled in with the starting
1248  * hash of the next page.
1249  */
1250 static int iam_htree_advance(struct inode *dir, __u32 hash,
1251                               struct iam_path *path, __u32 *start_hash,
1252                               int compat)
1253 {
1254         struct iam_frame *p;
1255         struct buffer_head *bh;
1256         int err, num_frames = 0;
1257         __u32 bhash;
1258
1259         p = path->ip_frame;
1260         /*
1261          * Find the next leaf page by incrementing the frame pointer.
1262          * If we run out of entries in the interior node, loop around and
1263          * increment pointer in the parent node.  When we break out of
1264          * this loop, num_frames indicates the number of interior
1265          * nodes that need to be read.
1266          */
1267         while (1) {
1268                 do_corr(schedule());
1269                 iam_lock_bh(p->bh);
1270                 if (p->at_shifted)
1271                         p->at_shifted = 0;
1272                 else
1273                         p->at = iam_entry_shift(path, p->at, +1);
1274                 if (p->at < iam_entry_shift(path, p->entries,
1275                                             dx_get_count(p->entries))) {
1276                         p->leaf = dx_get_block(path, p->at);
1277                         iam_unlock_bh(p->bh);
1278                         break;
1279                 }
1280                 iam_unlock_bh(p->bh);
1281                 if (p == path->ip_frames)
1282                         return 0;
1283                 num_frames++;
1284                 --p;
1285         }
1286
1287         if (compat) {
1288                 /*
1289                  * Htree hash magic.
1290                  */
1291
1292                 /*
1293                  * If the hash is 1, then continue only if the next page has a
1294                  * continuation hash of any value.  This is used for readdir
1295                  * handling.  Otherwise, check to see if the hash matches the
1296                  * desired continuation hash.  If it doesn't, return, since
1297                  * there's no point in reading the successive index pages.
1298                  */
1299                 dx_get_ikey(path, p->at, (struct iam_ikey *)&bhash);
1300                 if (start_hash)
1301                         *start_hash = bhash;
1302                 if ((hash & 1) == 0) {
1303                         if ((bhash & ~1) != hash)
1304                                 return 0;
1305                 }
1306         }
1307         /*
1308          * If the hash is HASH_NB_ALWAYS, we always go to the next
1309          * block so no check is necessary
1310          */
1311         while (num_frames--) {
1312                 iam_ptr_t idx;
1313
1314                 do_corr(schedule());
1315                 iam_lock_bh(p->bh);
1316                 idx = p->leaf = dx_get_block(path, p->at);
1317                 iam_unlock_bh(p->bh);
1318                 err = iam_path_descr(path)->id_ops->
1319                         id_node_read(path->ip_container, idx, NULL, &bh);
1320                 if (err != 0)
1321                         return err; /* Failure */
1322                 ++p;
1323                 brelse(p->bh);
1324                 assert_corr(p->bh != bh);
1325                 p->bh = bh;
1326                 p->entries = dx_node_get_entries(path, p);
1327                 p->at = iam_entry_shift(path, p->entries, !compat);
1328                 assert_corr(p->curidx != idx);
1329                 p->curidx = idx;
1330                 iam_lock_bh(p->bh);
1331                 assert_corr(p->leaf != dx_get_block(path, p->at));
1332                 p->leaf = dx_get_block(path, p->at);
1333                 iam_unlock_bh(p->bh);
1334                 assert_inv(dx_node_check(path, p));
1335         }
1336         return 1;
1337 }
1338
1339 static inline int iam_index_advance(struct iam_path *path)
1340 {
1341         return iam_htree_advance(iam_path_obj(path), 0, path, NULL, 0);
1342 }
1343
1344 static void iam_unlock_array(struct iam_container *ic,
1345                              struct dynlock_handle **lh)
1346 {
1347         int i;
1348
1349         for (i = 0; i < DX_MAX_TREE_HEIGHT; ++i, ++lh) {
1350                 if (*lh != NULL) {
1351                         iam_unlock_htree(ic, *lh);
1352                         *lh = NULL;
1353                 }
1354         }
1355 }
1356 /*
1357  * Advance index part of @path to point to the next leaf. Returns 1 on
1358  * success, 0 when the end of the container was reached. Leaf node is locked.
1359  */
1360 int iam_index_next(struct iam_container *c, struct iam_path *path)
1361 {
1362         iam_ptr_t cursor;
1363         struct dynlock_handle *lh[DX_MAX_TREE_HEIGHT] = { NULL, };
1364         int result;
1365
1366         /*
1367          * Locking for iam_index_next()... is to be described.
1368          */
1369
1370         cursor = path->ip_frame->leaf;
1371
1372         while (1) {
1373                 result = iam_index_lock(path, lh);
1374                 do_corr(schedule());
1375                 if (result < 0)
1376                         break;
1377
1378                 result = iam_check_full_path(path, 0);
1379                 if (result == 0 && cursor == path->ip_frame->leaf) {
1380                         result = iam_index_advance(path);
1381
1382                         assert_corr(result == 0 ||
1383                                     cursor != path->ip_frame->leaf);
1384                         break;
1385                 }
1386                 do {
1387                         iam_unlock_array(c, lh);
1388
1389                         iam_path_release(path);
1390                         do_corr(schedule());
1391
1392                         result = __iam_path_lookup(path);
1393                         if (result < 0)
1394                                 break;
1395
1396                         while (path->ip_frame->leaf != cursor) {
1397                                 do_corr(schedule());
1398
1399                                 result = iam_index_lock(path, lh);
1400                                 do_corr(schedule());
1401                                 if (result < 0)
1402                                         break;
1403
1404                                 result = iam_check_full_path(path, 0);
1405                                 if (result != 0)
1406                                         break;
1407
1408                                 result = iam_index_advance(path);
1409                                 if (result == 0) {
1410                                         CERROR("cannot find cursor : %u\n",
1411                                                 cursor);
1412                                         result = -EIO;
1413                                 }
1414                                 if (result < 0)
1415                                         break;
1416                                 result = iam_check_full_path(path, 0);
1417                                 if (result != 0)
1418                                         break;
1419                                 iam_unlock_array(c, lh);
1420                         }
1421                 } while (result == -EAGAIN);
1422                 if (result < 0)
1423                         break;
1424         }
1425         iam_unlock_array(c, lh);
1426         return result;
1427 }
1428
1429 /*
1430  * Move iterator one record right.
1431  *
1432  * Return value: 0: success,
1433  *              +1: end of container reached
1434  *             -ve: error
1435  *
1436  * precondition:  (it_state(it) == IAM_IT_ATTACHED ||
1437  *                 it_state(it) == IAM_IT_SKEWED) && it->ii_flags&IAM_IT_MOVE
1438  * postcondition: ergo(result == 0, it_state(it) == IAM_IT_ATTACHED) &&
1439  *                ergo(result >  0, it_state(it) == IAM_IT_DETACHED)
1440  */
1441 int iam_it_next(struct iam_iterator *it)
1442 {
1443         int result;
1444         struct iam_path *path;
1445         struct iam_leaf *leaf;
1446
1447         do_corr(struct iam_ikey *ik_orig);
1448
1449         /* assert_corr(it->ii_flags&IAM_IT_MOVE); */
1450         assert_corr(it_state(it) == IAM_IT_ATTACHED ||
1451                     it_state(it) == IAM_IT_SKEWED);
1452
1453         path = &it->ii_path;
1454         leaf = &path->ip_leaf;
1455
1456         assert_corr(iam_leaf_is_locked(leaf));
1457
1458         result = 0;
1459         do_corr(ik_orig = it_at_rec(it) ?
1460                 iam_it_ikey_get(it, iam_path_ikey(path, 2)) : NULL);
1461         if (it_before(it)) {
1462                 assert_corr(!iam_leaf_at_end(leaf));
1463                 it->ii_state = IAM_IT_ATTACHED;
1464         } else {
1465                 if (!iam_leaf_at_end(leaf))
1466                         /* advance within leaf node */
1467                         iam_leaf_next(leaf);
1468                 /*
1469                  * multiple iterations may be necessary due to empty leaves.
1470                  */
1471                 while (result == 0 && iam_leaf_at_end(leaf)) {
1472                         do_corr(schedule());
1473                         /* advance index portion of the path */
1474                         result = iam_index_next(iam_it_container(it), path);
1475                         assert_corr(iam_leaf_is_locked(leaf));
1476                         if (result == 1) {
1477                                 struct dynlock_handle *lh;
1478                                 lh = iam_lock_htree(iam_it_container(it),
1479                                                     path->ip_frame->leaf,
1480                                                     DLT_WRITE);
1481                                 if (lh != NULL) {
1482                                         iam_leaf_fini(leaf);
1483                                         leaf->il_lock = lh;
1484                                         result = iam_leaf_load(path);
1485                                         if (result == 0)
1486                                                 iam_leaf_start(leaf);
1487                                 } else
1488                                         result = -ENOMEM;
1489                         } else if (result == 0)
1490                                 /* end of container reached */
1491                                 result = +1;
1492                         if (result != 0)
1493                                 iam_it_put(it);
1494                 }
1495                 if (result == 0)
1496                         it->ii_state = IAM_IT_ATTACHED;
1497         }
1498         assert_corr(ergo(result == 0, it_state(it) == IAM_IT_ATTACHED));
1499         assert_corr(ergo(result >  0, it_state(it) == IAM_IT_DETACHED));
1500         assert_corr(ergo(result == 0 && ik_orig != NULL,
1501                     it_ikeycmp(it, ik_orig) >= 0));
1502         return result;
1503 }
1504
1505 /*
1506  * Return pointer to the record under iterator.
1507  *
1508  * precondition:  it_state(it) == IAM_IT_ATTACHED && it_at_rec(it)
1509  * postcondition: it_state(it) == IAM_IT_ATTACHED
1510  */
1511 struct iam_rec *iam_it_rec_get(const struct iam_iterator *it)
1512 {
1513         assert_corr(it_state(it) == IAM_IT_ATTACHED);
1514         assert_corr(it_at_rec(it));
1515         return iam_leaf_rec(&it->ii_path.ip_leaf);
1516 }
1517
1518 static void iam_it_reccpy(struct iam_iterator *it, const struct iam_rec *r)
1519 {
1520         struct iam_leaf *folio;
1521
1522         folio = &it->ii_path.ip_leaf;
1523         iam_leaf_ops(folio)->rec_set(folio, r);
1524 }
1525
1526 /*
1527  * Replace contents of record under iterator.
1528  *
1529  * precondition:  it_state(it) == IAM_IT_ATTACHED &&
1530  *                it->ii_flags&IAM_IT_WRITE
1531  * postcondition: it_state(it) == IAM_IT_ATTACHED &&
1532  *                ergo(result == 0, !memcmp(iam_it_rec_get(it), r, ...))
1533  */
1534 int iam_it_rec_set(handle_t *h,
1535                    struct iam_iterator *it, const struct iam_rec *r)
1536 {
1537         int result;
1538         struct iam_path *path;
1539         struct buffer_head *bh;
1540
1541         assert_corr(it_state(it) == IAM_IT_ATTACHED &&
1542                     it->ii_flags&IAM_IT_WRITE);
1543         assert_corr(it_at_rec(it));
1544
1545         path = &it->ii_path;
1546         bh = path->ip_leaf.il_bh;
1547         result = iam_txn_add(h, path, bh);
1548         if (result == 0) {
1549                 iam_it_reccpy(it, r);
1550                 result = iam_txn_dirty(h, path, bh);
1551         }
1552         return result;
1553 }
1554
1555 /*
1556  * Return pointer to the index key under iterator.
1557  *
1558  * precondition:  it_state(it) == IAM_IT_ATTACHED ||
1559  *                it_state(it) == IAM_IT_SKEWED
1560  */
1561 static struct iam_ikey *iam_it_ikey_get(const struct iam_iterator *it,
1562                                         struct iam_ikey *ikey)
1563 {
1564         assert_corr(it_state(it) == IAM_IT_ATTACHED ||
1565                     it_state(it) == IAM_IT_SKEWED);
1566         assert_corr(it_at_rec(it));
1567         return iam_leaf_ikey(&it->ii_path.ip_leaf, ikey);
1568 }
1569
1570 /*
1571  * Return pointer to the key under iterator.
1572  *
1573  * precondition:  it_state(it) == IAM_IT_ATTACHED ||
1574  *                it_state(it) == IAM_IT_SKEWED
1575  */
1576 struct iam_key *iam_it_key_get(const struct iam_iterator *it)
1577 {
1578         assert_corr(it_state(it) == IAM_IT_ATTACHED ||
1579                     it_state(it) == IAM_IT_SKEWED);
1580         assert_corr(it_at_rec(it));
1581         return iam_leaf_key(&it->ii_path.ip_leaf);
1582 }
1583
1584 /*
1585  * Return size of key under iterator (in bytes)
1586  *
1587  * precondition:  it_state(it) == IAM_IT_ATTACHED ||
1588  *                it_state(it) == IAM_IT_SKEWED
1589  */
1590 int iam_it_key_size(const struct iam_iterator *it)
1591 {
1592         assert_corr(it_state(it) == IAM_IT_ATTACHED ||
1593                     it_state(it) == IAM_IT_SKEWED);
1594         assert_corr(it_at_rec(it));
1595         return iam_leaf_key_size(&it->ii_path.ip_leaf);
1596 }
1597
1598 static struct buffer_head *
1599 iam_new_node(handle_t *h, struct iam_container *c, iam_ptr_t *b, int *e)
1600 {
1601         struct inode *inode = c->ic_object;
1602         struct buffer_head *bh = NULL;
1603         struct iam_idle_head *head;
1604         struct buffer_head *idle;
1605         __u32 *idle_blocks;
1606         __u16 count;
1607
1608         if (c->ic_idle_bh == NULL)
1609                 goto newblock;
1610
1611         mutex_lock(&c->ic_idle_mutex);
1612         if (unlikely(c->ic_idle_bh == NULL)) {
1613                 mutex_unlock(&c->ic_idle_mutex);
1614                 goto newblock;
1615         }
1616
1617         head = (struct iam_idle_head *)(c->ic_idle_bh->b_data);
1618         count = le16_to_cpu(head->iih_count);
1619         if (count > 0) {
1620                 *e = ldiskfs_journal_get_write_access(h, c->ic_idle_bh);
1621                 if (*e != 0)
1622                         goto fail;
1623
1624                 --count;
1625                 *b = le32_to_cpu(head->iih_blks[count]);
1626                 head->iih_count = cpu_to_le16(count);
1627                 *e = ldiskfs_handle_dirty_metadata(h, inode, c->ic_idle_bh);
1628                 if (*e != 0)
1629                         goto fail;
1630
1631                 mutex_unlock(&c->ic_idle_mutex);
1632                 bh = __ldiskfs_bread(NULL, inode, *b, 0);
1633                 if (IS_ERR_OR_NULL(bh)) {
1634                         if (IS_ERR(bh))
1635                                 *e = PTR_ERR(bh);
1636                         else
1637                                 *e = -EIO;
1638                         return NULL;
1639                 }
1640                 goto got;
1641         }
1642
1643         /* The block itself which contains the iam_idle_head is
1644          * also an idle block, and can be used as the new node. */
1645         idle_blocks = (__u32 *)(c->ic_root_bh->b_data +
1646                                 c->ic_descr->id_root_gap +
1647                                 sizeof(struct dx_countlimit));
1648         *e = ldiskfs_journal_get_write_access(h, c->ic_root_bh);
1649         if (*e != 0)
1650                 goto fail;
1651
1652         *b = le32_to_cpu(*idle_blocks);
1653         iam_lock_bh(c->ic_root_bh);
1654         *idle_blocks = head->iih_next;
1655         iam_unlock_bh(c->ic_root_bh);
1656         *e = ldiskfs_handle_dirty_metadata(h, inode, c->ic_root_bh);
1657         if (*e != 0) {
1658                 iam_lock_bh(c->ic_root_bh);
1659                 *idle_blocks = cpu_to_le32(*b);
1660                 iam_unlock_bh(c->ic_root_bh);
1661                 goto fail;
1662         }
1663
1664         bh = c->ic_idle_bh;
1665         idle = iam_load_idle_blocks(c, le32_to_cpu(*idle_blocks));
1666         if (idle != NULL && IS_ERR(idle)) {
1667                 *e = PTR_ERR(idle);
1668                 c->ic_idle_bh = NULL;
1669                 brelse(bh);
1670                 goto fail;
1671         }
1672
1673         c->ic_idle_bh = idle;
1674         mutex_unlock(&c->ic_idle_mutex);
1675
1676 got:
1677         /* get write access for the found buffer head */
1678         *e = ldiskfs_journal_get_write_access(h, bh);
1679         if (*e != 0) {
1680                 brelse(bh);
1681                 bh = NULL;
1682                 ldiskfs_std_error(inode->i_sb, *e);
1683         } else {
1684                 /* Clear the reused node, as for a newly allocated node. */
1685                 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
1686                 set_buffer_uptodate(bh);
1687         }
1688         return bh;
1689
1690 newblock:
1691         bh = osd_ldiskfs_append(h, inode, b);
1692         if (IS_ERR(bh)) {
1693                 *e = PTR_ERR(bh);
1694                 bh = NULL;
1695         }
1696
1697         return bh;
1698
1699 fail:
1700         mutex_unlock(&c->ic_idle_mutex);
1701         ldiskfs_std_error(inode->i_sb, *e);
1702         return NULL;
1703 }
1704
1705 /*
1706  * Insertion of a new record. Interaction with jbd in the non-trivial case
1707  * (when a split happens) is as follows:
1708  *
1709  *  - the new leaf node is added to the transaction by iam_new_node();
1710  *
1711  *  - the old leaf node is added to the transaction by iam_add_rec();
1712  *
1713  *  - the leaf containing the insertion point is marked dirty by iam_add_rec();
1714  *
1715  *  - the leaf without the insertion point is marked dirty (as @new_leaf) by
1716  *  iam_new_leaf();
1717  *
1718  *  - split index nodes are added to the transaction and marked dirty by
1719  *  split_index_node();
1720  *
1721  *  - the "safe" index node, which is not split but has a new pointer inserted
1722  *  into it, is added to the transaction and marked dirty by split_index_node();
1723  *
1724  *  - the index node where the pointer to the new leaf is inserted is added to
1725  *  the transaction by split_index_node() and marked dirty by iam_add_rec();
1726  *
1727  *  - the inode is marked dirty by iam_add_rec().
1728  *
1729  */
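/*
 * A minimal sketch of the journalling pattern referred to above, assuming the
 * caller already holds a started handle (illustrative only, not compiled):
 *
 *      err = iam_txn_add(handle, path, bh);           obtain jbd write access
 *      if (err == 0) {
 *              ... modify bh->b_data ...
 *              err = iam_txn_dirty(handle, path, bh); mark the buffer dirty
 *      }
 */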
1730
1731 static int iam_new_leaf(handle_t *handle, struct iam_leaf *leaf)
1732 {
1733         int err;
1734         iam_ptr_t blknr;
1735         struct buffer_head *new_leaf;
1736         struct buffer_head *old_leaf;
1737         struct iam_container *c;
1738         struct inode *obj;
1739         struct iam_path *path;
1740
1741         c = iam_leaf_container(leaf);
1742         path = leaf->il_path;
1743
1744         obj = c->ic_object;
1745         new_leaf = iam_new_node(handle, c, &blknr, &err);
1746         do_corr(schedule());
1747         if (new_leaf != NULL) {
1748                 struct dynlock_handle *lh;
1749
1750                 lh = iam_lock_htree(c, blknr, DLT_WRITE);
1751                 do_corr(schedule());
1752                 if (lh != NULL) {
1753                         iam_leaf_ops(leaf)->init_new(c, new_leaf);
1754                         do_corr(schedule());
1755                         old_leaf = leaf->il_bh;
1756                         iam_leaf_split(leaf, &new_leaf, blknr);
1757                         if (old_leaf != leaf->il_bh) {
1758                                 /*
1759                                  * Switched to the new leaf.
1760                                  */
1761                                 iam_leaf_unlock(leaf);
1762                                 leaf->il_lock = lh;
1763                                 path->ip_frame->leaf = blknr;
1764                         } else
1765                                 iam_unlock_htree(path->ip_container, lh);
1766                         do_corr(schedule());
1767                         err = iam_txn_dirty(handle, path, new_leaf);
1768                         if (err == 0)
1769                                 err = ldiskfs_mark_inode_dirty(handle, obj);
1770                         do_corr(schedule());
1771                 } else
1772                         err = -ENOMEM;
1773                 brelse(new_leaf);
1774         }
1775         assert_inv(iam_path_check(iam_leaf_path(leaf)));
1776         return err;
1777 }
1778
1779 static inline void dx_set_limit(struct iam_entry *entries, unsigned value)
1780 {
1781         ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
1782 }
1783
1784 static int iam_shift_entries(struct iam_path *path,
1785                          struct iam_frame *frame, unsigned count,
1786                          struct iam_entry *entries, struct iam_entry *entries2,
1787                          u32 newblock)
1788 {
1789         unsigned count1;
1790         unsigned count2;
1791         int delta;
1792
1793         struct iam_frame *parent = frame - 1;
1794         struct iam_ikey *pivot = iam_path_ikey(path, 3);
1795
1796         delta = dx_index_is_compat(path) ? 0 : +1;
1797
1798         count1 = count/2 + delta;
1799         count2 = count - count1;
1800         dx_get_ikey(path, iam_entry_shift(path, entries, count1), pivot);
1801
1802         dxtrace(printk("Split index %d/%d\n", count1, count2));
1803
1804         memcpy((char *) iam_entry_shift(path, entries2, delta),
1805                (char *) iam_entry_shift(path, entries, count1),
1806                count2 * iam_entry_size(path));
1807
1808         dx_set_count(entries2, count2 + delta);
1809         dx_set_limit(entries2, dx_node_limit(path));
1810
1811         /*
1812          * NOTE: very subtle piece of code. A competing dx_probe() may find the
1813          * 2nd-level index in the root index; we then insert the new index here
1814          * and set the new count in that 2nd-level index, so dx_probe() may see
1815          * a 2nd-level index without the hash it looks for. The fix is to check
1816          * the root index again after locking the just-found 2nd-level index -bzzz
1817          */
1818         iam_insert_key_lock(path, parent, pivot, newblock);
1819
1820         /*
1821          * Now both the old and the new 2nd-level index blocks contain all the
1822          * pointers, so dx_probe() may find a pointer in either. That is OK -bzzz
1823          */
1824         iam_lock_bh(frame->bh);
1825         dx_set_count(entries, count1);
1826         iam_unlock_bh(frame->bh);
1827
1828         /*
1829          * Now the old 2nd-level index block points to the first half of the
1830          * leaves. It is important that dx_probe() checks the root index block
1831          * for changes under dx_lock_bh(frame->bh) -bzzz
1832          */
1833
1834         return count1;
1835 }
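/*
 * Worked example of the split arithmetic above (illustrative): for an iam
 * (non-compat) node with count == 10, delta == +1, so count1 == 10/2 + 1 == 6
 * entries stay in the old block and count2 == 4 entries are copied into the
 * new block, whose count is set to count2 + delta == 5.
 */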
1836
1837
1838 int split_index_node(handle_t *handle, struct iam_path *path,
1839                      struct dynlock_handle **lh)
1840 {
1841         struct iam_entry *entries;   /* old block contents */
1842         struct iam_entry *entries2;  /* new block contents */
1843         struct iam_frame *frame, *safe;
1844         struct buffer_head *bh_new[DX_MAX_TREE_HEIGHT] = {NULL};
1845         u32 newblock[DX_MAX_TREE_HEIGHT] = {0};
1846         struct dynlock_handle *lock[DX_MAX_TREE_HEIGHT] = {NULL,};
1847         struct dynlock_handle *new_lock[DX_MAX_TREE_HEIGHT] = {NULL,};
1848         struct inode *dir = iam_path_obj(path);
1849         struct iam_descr *descr;
1850         int nr_splet;
1851         int i, err;
1852
1853         descr = iam_path_descr(path);
1854         /*
1855          * Algorithm below depends on this.
1856          */
1857         assert_corr(dx_root_limit(path) < dx_node_limit(path));
1858
1859         frame = path->ip_frame;
1860         entries = frame->entries;
1861
1862         /*
1863          * Tall-tree handling: we might have to split multiple index blocks
1864          * all the way up to tree root. Tricky point here is error handling:
1865          * to avoid complicated undo/rollback we
1866          *
1867          *   - first allocate all necessary blocks
1868          *
1869          *   - insert pointers into them atomically.
1870          */
1871
1872         /*
1873          * Locking: the leaf is already locked. htree locks are acquired,
1874          * bottom-to-top, on all index nodes that require splitting, on the
1875          * "safe" node, and on all new nodes.
1876          */
1877
1878         dxtrace(printk("using %u of %u node entries\n",
1879                        dx_get_count(entries), dx_get_limit(entries)));
1880
1881         /* What levels need split? */
1882         for (nr_splet = 0; frame >= path->ip_frames &&
1883              dx_get_count(frame->entries) == dx_get_limit(frame->entries);
1884              --frame, ++nr_splet) {
1885                 do_corr(schedule());
1886                 if (nr_splet == DX_MAX_TREE_HEIGHT) {
1887                         /*
1888                          * CWARN(dir->i_sb, __FUNCTION__,
1889                          * "Directory index full!\n");
1890                          */
1891                         err = -ENOSPC;
1892                         goto cleanup;
1893                 }
1894         }
1895
1896         safe = frame;
1897
1898         /*
1899          * Lock all nodes, bottom to top.
1900          */
1901         for (frame = path->ip_frame, i = nr_splet; i >= 0; --i, --frame) {
1902                 do_corr(schedule());
1903                 lock[i] = iam_lock_htree(path->ip_container, frame->curidx,
1904                                          DLT_WRITE);
1905                 if (lock[i] == NULL) {
1906                         err = -ENOMEM;
1907                         goto cleanup;
1908                 }
1909         }
1910
1911         /*
1912          * Check for concurrent index modification.
1913          */
1914         err = iam_check_full_path(path, 1);
1915         if (err)
1916                 goto cleanup;
1917         /*
1918          * And check that the same number of nodes is to be split.
1919          */
1920         for (i = 0, frame = path->ip_frame; frame >= path->ip_frames &&
1921              dx_get_count(frame->entries) == dx_get_limit(frame->entries);
1922              --frame, ++i) {
1923                 ;
1924         }
1925         if (i != nr_splet) {
1926                 err = -EAGAIN;
1927                 goto cleanup;
1928         }
1929
1930         /*
1931          * Go back down, allocating blocks, locking them, and adding them to
1932          * the transaction...
1933          */
1934         for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) {
1935                 bh_new[i] = iam_new_node(handle, path->ip_container,
1936                                          &newblock[i], &err);
1937                 do_corr(schedule());
1938                 if (!bh_new[i] ||
1939                     descr->id_ops->id_node_init(path->ip_container,
1940                                                 bh_new[i], 0) != 0)
1941                         goto cleanup;
1942
1943                 new_lock[i] = iam_lock_htree(path->ip_container, newblock[i],
1944                                              DLT_WRITE);
1945                 if (new_lock[i] == NULL) {
1946                         err = -ENOMEM;
1947                         goto cleanup;
1948                 }
1949                 do_corr(schedule());
1950                 BUFFER_TRACE(frame->bh, "get_write_access");
1951                 err = ldiskfs_journal_get_write_access(handle, frame->bh);
1952                 if (err)
1953                         goto journal_error;
1954         }
1955         /* Add "safe" node to transaction too */
1956         if (safe + 1 != path->ip_frames) {
1957                 do_corr(schedule());
1958                 err = ldiskfs_journal_get_write_access(handle, safe->bh);
1959                 if (err)
1960                         goto journal_error;
1961         }
1962
1963         /* Go through nodes once more, inserting pointers */
1964         for (frame = safe + 1, i = 0; i < nr_splet; ++i, ++frame) {
1965                 unsigned count;
1966                 int idx;
1967                 struct buffer_head *bh2;
1968                 struct buffer_head *bh;
1969
1970                 entries = frame->entries;
1971                 count = dx_get_count(entries);
1972                 idx = iam_entry_diff(path, frame->at, entries);
1973
1974                 bh2 = bh_new[i];
1975                 entries2 = dx_get_entries(path, bh2->b_data, 0);
1976
1977                 bh = frame->bh;
1978                 if (frame == path->ip_frames) {
1979                         /* splitting root node. Tricky point:
1980                          *
1981                          * In the "normal" B-tree we'd split root *and* add
1982                          * new root to the tree with pointers to the old root
1983                          * and its sibling (thus introducing two new nodes).
1984                          *
1985                          * In htree it's enough to add one node, because
1986                          * capacity of the root node is smaller than that of
1987                          * non-root one.
1988                          */
1989                         struct iam_frame *frames;
1990                         struct iam_entry *next;
1991
1992                         assert_corr(i == 0);
1993
1994                         do_corr(schedule());
1995
1996                         frames = path->ip_frames;
1997                         memcpy((char *) entries2, (char *) entries,
1998                                count * iam_entry_size(path));
1999                         dx_set_limit(entries2, dx_node_limit(path));
2000
2001                         /* Set up root */
2002                         iam_lock_bh(frame->bh);
2003                         next = descr->id_ops->id_root_inc(path->ip_container,
2004                                                           path, frame);
2005                         dx_set_block(path, next, newblock[0]);
2006                         iam_unlock_bh(frame->bh);
2007
2008                         do_corr(schedule());
2009                         /* Shift frames in the path */
2010                         memmove(frames + 2, frames + 1,
2011                                (sizeof path->ip_frames) - 2 * sizeof frames[0]);
2012                         /* Add new access path frame */
2013                         frames[1].at = iam_entry_shift(path, entries2, idx);
2014                         frames[1].entries = entries = entries2;
2015                         frames[1].bh = bh2;
2016                         assert_inv(dx_node_check(path, frame));
2017                         ++ path->ip_frame;
2018                         ++ frame;
2019                         assert_inv(dx_node_check(path, frame));
2020                         bh_new[0] = NULL; /* buffer head is "consumed" */
2021                         err = ldiskfs_handle_dirty_metadata(handle, NULL, bh2);
2022                         if (err)
2023                                 goto journal_error;
2024                         do_corr(schedule());
2025                 } else {
2026                         /* splitting non-root index node. */
2027                         struct iam_frame *parent = frame - 1;
2028
2029                         do_corr(schedule());
2030                         count = iam_shift_entries(path, frame, count,
2031                                                 entries, entries2, newblock[i]);
2032                         /* Which index block gets the new entry? */
2033                         if (idx >= count) {
2034                                 int d = dx_index_is_compat(path) ? 0 : +1;
2035
2036                                 frame->at = iam_entry_shift(path, entries2,
2037                                                             idx - count + d);
2038                                 frame->entries = entries = entries2;
2039                                 frame->curidx = newblock[i];
2040                                 swap(frame->bh, bh2);
2041                                 assert_corr(lock[i + 1] != NULL);
2042                                 assert_corr(new_lock[i] != NULL);
2043                                 swap(lock[i + 1], new_lock[i]);
2044                                 bh_new[i] = bh2;
2045                                 parent->at = iam_entry_shift(path,
2046                                                              parent->at, +1);
2047                         }
2048                         assert_inv(dx_node_check(path, frame));
2049                         assert_inv(dx_node_check(path, parent));
2050                         dxtrace(dx_show_index("node", frame->entries));
2051                         dxtrace(dx_show_index("node",
2052                                 ((struct dx_node *) bh2->b_data)->entries));
2053                         err = ldiskfs_handle_dirty_metadata(handle, NULL, bh2);
2054                         if (err)
2055                                 goto journal_error;
2056                         do_corr(schedule());
2057                         err = ldiskfs_handle_dirty_metadata(handle, NULL,
2058                                                             parent->bh);
2059                         if (err)
2060                                 goto journal_error;
2061                 }
2062                 do_corr(schedule());
2063                 err = ldiskfs_handle_dirty_metadata(handle, NULL, bh);
2064                 if (err)
2065                         goto journal_error;
2066         }
2067         /*
2068          * This function was called to make insertion of new leaf
2069          * possible. Check that it fulfilled its obligations.
2070          */
2071         assert_corr(dx_get_count(path->ip_frame->entries) <
2072                     dx_get_limit(path->ip_frame->entries));
2073         assert_corr(lock[nr_splet] != NULL);
2074         *lh = lock[nr_splet];
2075         lock[nr_splet] = NULL;
2076         if (nr_splet > 0) {
2077                 /*
2078                  * Log ->i_size modification.
2079                  */
2080                 err = ldiskfs_mark_inode_dirty(handle, dir);
2081                 if (err)
2082                         goto journal_error;
2083         }
2084         goto cleanup;
2085 journal_error:
2086         ldiskfs_std_error(dir->i_sb, err);
2087
2088 cleanup:
2089         iam_unlock_array(path->ip_container, lock);
2090         iam_unlock_array(path->ip_container, new_lock);
2091
2092         assert_corr(err || iam_frame_is_locked(path, path->ip_frame));
2093
2094         do_corr(schedule());
2095         for (i = 0; i < ARRAY_SIZE(bh_new); ++i) {
2096                 if (bh_new[i] != NULL)
2097                         brelse(bh_new[i]);
2098         }
2099         return err;
2100 }
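/*
 * Root split, schematically (an illustrative summary of the code above): the
 * old root's entries are copied into a freshly allocated index block and the
 * root is re-initialized via ->id_root_inc() with a pointer to that block, so
 * the tree grows by one level while only one new node is added:
 *
 *      before:   root [e0 e1 ... eN]              (root at its limit)
 *      after:    root [p] ---> node [e0 e1 ... eN]
 */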
2101
2102 static int iam_add_rec(handle_t *handle, struct iam_iterator *it,
2103                        struct iam_path *path,
2104                        const struct iam_key *k, const struct iam_rec *r)
2105 {
2106         int err;
2107         struct iam_leaf *leaf;
2108
2109         leaf = &path->ip_leaf;
2110         assert_inv(iam_path_check(path));
2111         err = iam_txn_add(handle, path, leaf->il_bh);
2112         if (err == 0) {
2113                 do_corr(schedule());
2114                 if (!iam_leaf_can_add(leaf, k, r)) {
2115                         struct dynlock_handle *lh = NULL;
2116
2117                         do {
2118                                 assert_corr(lh == NULL);
2119                                 do_corr(schedule());
2120                                 err = split_index_node(handle, path, &lh);
2121                                 if (err == -EAGAIN) {
2122                                         assert_corr(lh == NULL);
2123
2124                                         iam_path_fini(path);
2125                                         it->ii_state = IAM_IT_DETACHED;
2126
2127                                         do_corr(schedule());
2128                                         err = iam_it_get_exact(it, k);
2129                                         if (err == -ENOENT)
2130                                                 err = +1; /* repeat split */
2131                                         else if (err == 0)
2132                                                 err = -EEXIST;
2133                                 }
2134                         } while (err > 0);
2135                         assert_inv(iam_path_check(path));
2136                         if (err == 0) {
2137                                 assert_corr(lh != NULL);
2138                                 do_corr(schedule());
2139                                 err = iam_new_leaf(handle, leaf);
2140                                 if (err == 0)
2141                                         err = iam_txn_dirty(handle, path,
2142                                                             path->ip_frame->bh);
2143                         }
2144                         iam_unlock_htree(path->ip_container, lh);
2145                         do_corr(schedule());
2146                 }
2147                 if (err == 0) {
2148                         iam_leaf_rec_add(leaf, k, r);
2149                         err = iam_txn_dirty(handle, path, leaf->il_bh);
2150                 }
2151         }
2152         assert_inv(iam_path_check(path));
2153         return err;
2154 }
2155
2156 /*
2157  * Insert new record with key @k and contents from @r, shifting records to the
2158  * right. On success, iterator is positioned on the newly inserted record.
2159  *
2160  * precondition: it->ii_flags&IAM_IT_WRITE &&
2161  *               (it_state(it) == IAM_IT_ATTACHED ||
2162  *                it_state(it) == IAM_IT_SKEWED) &&
2163  *               ergo(it_state(it) == IAM_IT_ATTACHED,
2164  *                    it_keycmp(it, k) <= 0) &&
2165  *               ergo(it_before(it), it_keycmp(it, k) > 0));
2166  * postcondition: ergo(result == 0,
2167  *                     it_state(it) == IAM_IT_ATTACHED &&
2168  *                     it_keycmp(it, k) == 0 &&
2169  *                     !memcmp(iam_it_rec_get(it), r, ...))
2170  */
2171 int iam_it_rec_insert(handle_t *h, struct iam_iterator *it,
2172                       const struct iam_key *k, const struct iam_rec *r)
2173 {
2174         int result;
2175         struct iam_path *path;
2176
2177         path = &it->ii_path;
2178
2179         assert_corr(it->ii_flags&IAM_IT_WRITE);
2180         assert_corr(it_state(it) == IAM_IT_ATTACHED ||
2181                     it_state(it) == IAM_IT_SKEWED);
2182         assert_corr(ergo(it_state(it) == IAM_IT_ATTACHED,
2183                     it_keycmp(it, k) <= 0));
2184         assert_corr(ergo(it_before(it), it_keycmp(it, k) > 0));
2185         result = iam_add_rec(h, it, path, k, r);
2186         if (result == 0)
2187                 it->ii_state = IAM_IT_ATTACHED;
2188         assert_corr(ergo(result == 0,
2189                          it_state(it) == IAM_IT_ATTACHED &&
2190                          it_keycmp(it, k) == 0));
2191         return result;
2192 }
2193
2194 static inline int iam_idle_blocks_limit(struct inode *inode)
2195 {
2196         return (inode->i_sb->s_blocksize - sizeof(struct iam_idle_head)) >> 2;
2197 }
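/*
 * Worked example (illustrative; assumes a 4096-byte block and an 8-byte
 * struct iam_idle_head): (4096 - 8) >> 2 == 1022, i.e. one idle-head block
 * can list up to 1022 reusable blocks in iih_blks[], each stored as a __u32.
 */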
2198
2199 /*
2200  * If the leaf cannot be recycled, we lose one block for future reuse. That
2201  * is not a serious issue; the result is about the same as not recycling.
2202  */
2203 static iam_ptr_t iam_index_shrink(handle_t *h, struct iam_path *p,
2204                                   struct iam_leaf *l, struct buffer_head **bh)
2205 {
2206         struct iam_container *c = p->ip_container;
2207         struct inode *inode = c->ic_object;
2208         struct iam_frame *frame = p->ip_frame;
2209         struct iam_entry *entries;
2210         struct iam_entry *pos;
2211         struct dynlock_handle *lh;
2212         int count;
2213         int rc;
2214
2215         if (c->ic_idle_failed)
2216                 return 0;
2217
2218         if (unlikely(frame == NULL))
2219                 return 0;
2220
2221         if (!iam_leaf_empty(l))
2222                 return 0;
2223
2224         lh = iam_lock_htree(c, frame->curidx, DLT_WRITE);
2225         if (lh == NULL) {
2226                 CWARN("%s: No memory to recycle idle blocks\n",
2227                       osd_ino2name(inode));
2228                 return 0;
2229         }
2230
2231         rc = iam_txn_add(h, p, frame->bh);
2232         if (rc != 0) {
2233                 iam_unlock_htree(c, lh);
2234                 return 0;
2235         }
2236
2237         iam_lock_bh(frame->bh);
2238         entries = frame->entries;
2239         count = dx_get_count(entries);
2240         /*
2241          * Do not shrink the last entry in the index node; it can be reused
2242          * directly by the next new node.
2243          */
2244         if (count == 2) {
2245                 iam_unlock_bh(frame->bh);
2246                 iam_unlock_htree(c, lh);
2247                 return 0;
2248         }
2249
2250         pos = iam_find_position(p, frame);
2251         /*
2252          * New leaf nodes may have been added, or empty leaf nodes may have been
2253          * shrunk, during our delete operation.
2254          *
2255          * If the empty leaf is no longer under the current index node because
2256          * that index node has been split, just skip the empty leaf (a rare case).
2257          */
2258         if (unlikely(frame->leaf != dx_get_block(p, pos))) {
2259                 iam_unlock_bh(frame->bh);
2260                 iam_unlock_htree(c, lh);
2261                 return 0;
2262         }
2263
2264         frame->at = pos;
2265         if (frame->at < iam_entry_shift(p, entries, count - 1)) {
2266                 struct iam_entry *n = iam_entry_shift(p, frame->at, 1);
2267
2268                 memmove(frame->at, n,
2269                         (char *)iam_entry_shift(p, entries, count) - (char *)n);
2270                 frame->at_shifted = 1;
2271         }
2272         dx_set_count(entries, count - 1);
2273         iam_unlock_bh(frame->bh);
2274         rc = iam_txn_dirty(h, p, frame->bh);
2275         iam_unlock_htree(c, lh);
2276         if (rc != 0)
2277                 return 0;
2278
2279         get_bh(l->il_bh);
2280         *bh = l->il_bh;
2281         return frame->leaf;
2282 }
2283
2284 static int
2285 iam_install_idle_blocks(handle_t *h, struct iam_path *p, struct buffer_head *bh,
2286                         __u32 *idle_blocks, iam_ptr_t blk)
2287 {
2288         struct iam_container *c = p->ip_container;
2289         struct buffer_head *old = c->ic_idle_bh;
2290         struct iam_idle_head *head;
2291         int rc;
2292
2293         head = (struct iam_idle_head *)(bh->b_data);
2294         head->iih_magic = cpu_to_le16(IAM_IDLE_HEADER_MAGIC);
2295         head->iih_count = 0;
2296         head->iih_next = *idle_blocks;
2297         /* Write access to the bh has already been obtained. */
2298         rc = iam_txn_dirty(h, p, bh);
2299         if (rc != 0)
2300                 return rc;
2301
2302         rc = iam_txn_add(h, p, c->ic_root_bh);
2303         if (rc != 0)
2304                 return rc;
2305
2306         iam_lock_bh(c->ic_root_bh);
2307         *idle_blocks = cpu_to_le32(blk);
2308         iam_unlock_bh(c->ic_root_bh);
2309         rc = iam_txn_dirty(h, p, c->ic_root_bh);
2310         if (rc == 0) {
2311                 /* Do not release the old bh before the new one is assigned. */
2312                 get_bh(bh);
2313                 c->ic_idle_bh = bh;
2314                 brelse(old);
2315         } else {
2316                 iam_lock_bh(c->ic_root_bh);
2317                 *idle_blocks = head->iih_next;
2318                 iam_unlock_bh(c->ic_root_bh);
2319         }
2320         return rc;
2321 }
2322
2323 /*
2324  * If the leaf cannot be recycled, we lose one block for future reuse. That
2325  * is not a serious issue; the result is about the same as not recycling.
2326  */
2327 static void iam_recycle_leaf(handle_t *h, struct iam_path *p,
2328                              struct buffer_head *bh, iam_ptr_t blk)
2329 {
2330         struct iam_container *c = p->ip_container;
2331         struct inode *inode = c->ic_object;
2332         struct iam_idle_head *head;
2333         __u32 *idle_blocks;
2334         int count;
2335         int rc;
2336
2337         mutex_lock(&c->ic_idle_mutex);
2338         if (unlikely(c->ic_idle_failed)) {
2339                 rc = -EFAULT;
2340                 goto unlock;
2341         }
2342
2343         idle_blocks = (__u32 *)(c->ic_root_bh->b_data +
2344                                 c->ic_descr->id_root_gap +
2345                                 sizeof(struct dx_countlimit));
2346         /* It is the first idle block. */
2347         if (c->ic_idle_bh == NULL) {
2348                 rc = iam_install_idle_blocks(h, p, bh, idle_blocks, blk);
2349                 goto unlock;
2350         }
2351
2352         head = (struct iam_idle_head *)(c->ic_idle_bh->b_data);
2353         count = le16_to_cpu(head->iih_count);
2354         /* Current ic_idle_bh is full, to be replaced by the leaf. */
2355         if (count == iam_idle_blocks_limit(inode)) {
2356                 rc = iam_install_idle_blocks(h, p, bh, idle_blocks, blk);
2357                 goto unlock;
2358         }
2359
2360         /* Just add to ic_idle_bh. */
2361         rc = iam_txn_add(h, p, c->ic_idle_bh);
2362         if (rc != 0)
2363                 goto unlock;
2364
2365         head->iih_blks[count] = cpu_to_le32(blk);
2366         head->iih_count = cpu_to_le16(count + 1);
2367         rc = iam_txn_dirty(h, p, c->ic_idle_bh);
2368
2369 unlock:
2370         mutex_unlock(&c->ic_idle_mutex);
2371         if (rc != 0)
2372                 CWARN("%s: idle blocks failed, will lose the blk %u\n",
2373                       osd_ino2name(inode), blk);
2374 }
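/*
 * The resulting idle-block bookkeeping, as a rough sketch derived from the
 * code above (offsets simplified):
 *
 *      root block: ... | dx_countlimit | idle_blocks (__u32) | ...
 *                                             |
 *                                             v
 *      idle head:  iih_magic | iih_count | iih_next | iih_blks[0..count-1]
 *                                             |
 *                                             v
 *      next idle head: ...
 *
 * c->ic_idle_bh caches the first idle-head block; iam_new_node() consumes
 * entries from iih_blks[] first and then reuses the head block itself.
 */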
2375
2376 /*
2377  * Delete record under iterator.
2378  *
2379  * precondition:  it_state(it) == IAM_IT_ATTACHED &&
2380  *                it->ii_flags&IAM_IT_WRITE &&
2381  *                it_at_rec(it)
2382  * postcondition: it_state(it) == IAM_IT_ATTACHED ||
2383  *                it_state(it) == IAM_IT_DETACHED
2384  */
2385 int iam_it_rec_delete(handle_t *h, struct iam_iterator *it)
2386 {
2387         int result;
2388         struct iam_leaf *leaf;
2389         struct iam_path *path;
2390
2391         assert_corr(it_state(it) == IAM_IT_ATTACHED &&
2392                     it->ii_flags&IAM_IT_WRITE);
2393         assert_corr(it_at_rec(it));
2394
2395         path = &it->ii_path;
2396         leaf = &path->ip_leaf;
2397
2398         assert_inv(iam_path_check(path));
2399
2400         result = iam_txn_add(h, path, leaf->il_bh);
2401         /*
2402          * no compaction for now.
2403          */
2404         if (result == 0) {
2405                 iam_rec_del(leaf, it->ii_flags&IAM_IT_MOVE);
2406                 result = iam_txn_dirty(h, path, leaf->il_bh);
2407                 if (result == 0 && iam_leaf_at_end(leaf)) {
2408                         struct buffer_head *bh = NULL;
2409                         iam_ptr_t blk;
2410
2411                         blk = iam_index_shrink(h, path, leaf, &bh);
2412                         if (it->ii_flags & IAM_IT_MOVE) {
2413                                 result = iam_it_next(it);
2414                                 if (result > 0)
2415                                         result = 0;
2416                         }
2417
2418                         if (bh != NULL) {
2419                                 iam_recycle_leaf(h, path, bh, blk);
2420                                 brelse(bh);
2421                         }
2422                 }
2423         }
2424         assert_inv(iam_path_check(path));
2425         assert_corr(it_state(it) == IAM_IT_ATTACHED ||
2426                     it_state(it) == IAM_IT_DETACHED);
2427         return result;
2428 }
2429
2430 /*
2431  * Convert iterator to cookie.
2432  *
2433  * precondition:  it_state(it) == IAM_IT_ATTACHED &&
2434  *                iam_it_container(it)->ic_descr->id_ikey_size <= sizeof(iam_pos_t)
2435  * postcondition: it_state(it) == IAM_IT_ATTACHED
2436  */
2437 iam_pos_t iam_it_store(const struct iam_iterator *it)
2438 {
2439         iam_pos_t result;
2440
2441         assert_corr(it_state(it) == IAM_IT_ATTACHED);
2442         assert_corr(it_at_rec(it));
2443         assert_corr(iam_it_container(it)->ic_descr->id_ikey_size <=
2444                     sizeof result);
2445
2446         result = 0;
2447         return *(iam_pos_t *)iam_it_ikey_get(it, (void *)&result);
2448 }
2449
2450 /*
2451  * Restore iterator from cookie.
2452  *
2453  * precondition:  it_state(it) == IAM_IT_DETACHED && it->ii_flags&IAM_IT_MOVE &&
2454  *                iam_it_container(it)->ic_descr->id_ikey_size <= sizeof(iam_pos_t)
2455  * postcondition: ergo(result == 0, it_state(it) == IAM_IT_ATTACHED &&
2456  *                                  iam_it_store(it) == pos)
2457  */
2458 int iam_it_load(struct iam_iterator *it, iam_pos_t pos)
2459 {
2460         assert_corr(it_state(it) == IAM_IT_DETACHED &&
2461                 it->ii_flags&IAM_IT_MOVE);
2462         assert_corr(iam_it_container(it)->ic_descr->id_ikey_size <= sizeof pos);
2463         return iam_it_iget(it, (struct iam_ikey *)&pos);
2464 }
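/*
 * Illustrative usage sketch (assumes a container @c, key @k and path
 * descriptor @pd; not part of the build): saving a position as a cookie and
 * resuming iteration from it later:
 *
 *      struct iam_iterator it;
 *      iam_pos_t cookie;
 *
 *      iam_it_init(&it, c, IAM_IT_MOVE, pd);
 *      if (iam_it_get_exact(&it, k) == 0) {
 *              cookie = iam_it_store(&it);
 *              iam_it_put(&it);
 *      }
 *      ...
 *      if (iam_it_load(&it, cookie) == 0)
 *              ... iam_it_rec_get(&it) ...
 *      iam_it_fini(&it);
 */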
2465
2466 /***********************************************************************/
2467 /* invariants                                                          */
2468 /***********************************************************************/
2469
2470 static inline int ptr_inside(void *base, size_t size, void *ptr)
2471 {
2472         return (base <= ptr) && (ptr < base + size);
2473 }
2474
2475 static int iam_frame_invariant(struct iam_frame *f)
2476 {
2477         return
2478                 (f->bh != NULL &&
2479                 f->bh->b_data != NULL &&
2480                 ptr_inside(f->bh->b_data, f->bh->b_size, f->entries) &&
2481                 ptr_inside(f->bh->b_data, f->bh->b_size, f->at) &&
2482                 f->entries <= f->at);
2483 }
2484
2485 static int iam_leaf_invariant(struct iam_leaf *l)
2486 {
2487         return
2488                 l->il_bh != NULL &&
2489                 l->il_bh->b_data != NULL &&
2490                 ptr_inside(l->il_bh->b_data, l->il_bh->b_size, l->il_entries) &&
2491                 ptr_inside(l->il_bh->b_data, l->il_bh->b_size, l->il_at) &&
2492                 l->il_entries <= l->il_at;
2493 }
2494
2495 static int iam_path_invariant(struct iam_path *p)
2496 {
2497         int i;
2498
2499         if (p->ip_container == NULL ||
2500             p->ip_indirect < 0 || p->ip_indirect > DX_MAX_TREE_HEIGHT - 1 ||
2501             p->ip_frame != p->ip_frames + p->ip_indirect ||
2502             !iam_leaf_invariant(&p->ip_leaf))
2503                 return 0;
2504         for (i = 0; i < ARRAY_SIZE(p->ip_frames); ++i) {
2505                 if (i <= p->ip_indirect) {
2506                         if (!iam_frame_invariant(&p->ip_frames[i]))
2507                                 return 0;
2508                 }
2509         }
2510         return 1;
2511 }
2512
2513 int iam_it_invariant(struct iam_iterator *it)
2514 {
2515         return
2516                 (it->ii_state == IAM_IT_DETACHED ||
2517                 it->ii_state == IAM_IT_ATTACHED ||
2518                 it->ii_state == IAM_IT_SKEWED) &&
2519                 !(it->ii_flags & ~(IAM_IT_MOVE | IAM_IT_WRITE)) &&
2520                 ergo(it->ii_state == IAM_IT_ATTACHED ||
2521                 it->ii_state == IAM_IT_SKEWED,
2522                 iam_path_invariant(&it->ii_path) &&
2523                 equi(it_at_rec(it), it->ii_state == IAM_IT_SKEWED));
2524 }
2525
2526 /*
2527  * Search container @c for record with key @k. If record is found, its data
2528  * are moved into @r.
2529  *
2530  * Return values: 0: found, -ENOENT: not-found, -ve: error
2531  */
2532 int iam_lookup(struct iam_container *c, const struct iam_key *k,
2533                struct iam_rec *r, struct iam_path_descr *pd)
2534 {
2535         struct iam_iterator it;
2536         int result;
2537
2538         iam_it_init(&it, c, 0, pd);
2539
2540         result = iam_it_get_exact(&it, k);
2541         if (result == 0)
2542                 /*
2543                  * record with required key found, copy it into user buffer
2544                  */
2545                 iam_reccpy(&it.ii_path.ip_leaf, r);
2546         iam_it_put(&it);
2547         iam_it_fini(&it);
2548         return result;
2549 }
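/*
 * Illustrative caller sketch (the buffer sizes are hypothetical; not part of
 * the build): a point lookup of a fixed-size record by key:
 *
 *      char key[16], rec[32];
 *
 *      ... fill key ...
 *      rc = iam_lookup(c, (struct iam_key *)key, (struct iam_rec *)rec, pd);
 *      if (rc == 0)
 *              ... use rec ...
 *      else if (rc == -ENOENT)
 *              ... not found ...
 */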
2550
2551 /*
2552  * Insert new record @r with key @k into container @c (within context of
2553  * transaction @h).
2554  *
2555  * Return values: 0: success, -ve: error, including -EEXIST when record with
2556  * given key is already present.
2557  *
2558  * postcondition: ergo(result == 0 || result == -EEXIST,
2559  *                                  iam_lookup(c, k, r2) == 0)
2560  */
2561 int iam_insert(handle_t *h, struct iam_container *c, const struct iam_key *k,
2562                const struct iam_rec *r, struct iam_path_descr *pd)
2563 {
2564         struct iam_iterator it;
2565         int result;
2566
2567         iam_it_init(&it, c, IAM_IT_WRITE, pd);
2568
2569         result = iam_it_get_exact(&it, k);
2570         if (result == -ENOENT)
2571                 result = iam_it_rec_insert(h, &it, k, r);
2572         else if (result == 0)
2573                 result = -EEXIST;
2574         iam_it_put(&it);
2575         iam_it_fini(&it);
2576         return result;
2577 }
2578
2579 /*
2580  * Update record with the key @k in container @c (within context of
2581  * transaction @h); the new record is given by @r.
2582  *
2583  * Return values: +1: skip because of the same rec value, 0: success,
2584  * -ve: error, including -ENOENT if no record with the given key found.
2585  */
2586 int iam_update(handle_t *h, struct iam_container *c, const struct iam_key *k,
2587                const struct iam_rec *r, struct iam_path_descr *pd)
2588 {
2589         struct iam_iterator it;
2590         struct iam_leaf *folio;
2591         int result;
2592
2593         iam_it_init(&it, c, IAM_IT_WRITE, pd);
2594
2595         result = iam_it_get_exact(&it, k);
2596         if (result == 0) {
2597                 folio = &it.ii_path.ip_leaf;
2598                 result = iam_leaf_ops(folio)->rec_eq(folio, r);
2599                 if (result == 0)
2600                         iam_it_rec_set(h, &it, r);
2601                 else
2602                         result = 1;
2603         }
2604         iam_it_put(&it);
2605         iam_it_fini(&it);
2606         return result;
2607 }
2608
2609 /*
2610  * Delete existing record with key @k.
2611  *
2612  * Return values: 0: success, -ENOENT: not-found, -ve: other error.
2613  *
2614  * postcondition: ergo(result == 0 || result == -ENOENT,
2615  *                                 iam_lookup(c, k, *) == -ENOENT);
2616  */
2617 int iam_delete(handle_t *h, struct iam_container *c, const struct iam_key *k,
2618                struct iam_path_descr *pd)
2619 {
2620         struct iam_iterator it;
2621         int result;
2622
2623         iam_it_init(&it, c, IAM_IT_WRITE, pd);
2624
2625         result = iam_it_get_exact(&it, k);
2626         if (result == 0)
2627                 iam_it_rec_delete(h, &it);
2628         iam_it_put(&it);
2629         iam_it_fini(&it);
2630         return result;
2631 }
2632
2633 int iam_root_limit(int rootgap, int blocksize, int size)
2634 {
2635         int limit;
2636         int nlimit;
2637
2638         limit = (blocksize - rootgap) / size;
2639         nlimit = blocksize / size;
2640         if (limit == nlimit)
2641                 limit--;
2642         return limit;
2643 }
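/*
 * Worked example (illustrative): with blocksize == 4096, rootgap == 32 and
 * 16-byte entries, limit == (4096 - 32) / 16 == 254 while a gap-less node
 * would hold 4096 / 16 == 256.  The "limit--" branch only fires when the gap
 * is smaller than one entry, keeping the root limit strictly below the plain
 * blocksize / size value; this helps satisfy the dx_root_limit() <
 * dx_node_limit() assertion in split_index_node().
 */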