Whamcloud - gitweb
b=19387 integrate LST into acc-sm
[fs/lustre-release.git] / lustre / osd / osd_iam_lvar.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * GPL HEADER START
5  *
6  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 only,
10  * as published by the Free Software Foundation.
11  *
12  * This program is distributed in the hope that it will be useful, but
13  * WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * General Public License version 2 for more details (a copy is included
16  * in the LICENSE file that accompanied this code).
17  *
18  * You should have received a copy of the GNU General Public License
19  * version 2 along with this program; If not, see [sun.com URL with a
20  * copy of GPLv2].
21  *
22  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23  * CA 95054 USA or visit www.sun.com if you need additional information or
24  * have any questions.
25  *
26  * GPL HEADER END
27  */
28 /*
29  * Copyright  2008 Sun Microsystems, Inc. All rights reserved
30  * Use is subject to license terms.
31  */
32 /*
33  * This file is part of Lustre, http://www.lustre.org/
34  * Lustre is a trademark of Sun Microsystems, Inc.
35  *
36  * iam_lvar.c
37  *
38  * implementation of iam format for fixed size records, variable sized keys.
39  *
40  * Author: Nikita Danilov <nikita@clusterfs.com>
41  */
42
43 #include <linux/types.h>
44 #include "osd_internal.h"
45
46 /*
47  * Leaf operations.
48  */
49
50 enum {
51         IAM_LVAR_LEAF_MAGIC = 0x1973 /* This is duplicated in
52                                       * lustre/utils/create_iam.c */
53 };
54
55 /* This is duplicated in lustre/utils/create_iam.c */
56 struct lvar_leaf_header {
57         __le16 vlh_magic; /* magic number IAM_LVAR_LEAF_MAGIC */
58         __le16 vlh_used;  /* used bytes, including header */
59 };
60
61 /*
62  * Format of leaf entry:
63  *
64  * __le16 keysize
65  *     u8 key[keysize]
66  *     u8 record[rec_size]
67  *
68  * Entries are ordered in key order.
69  */
70
71 /* This is duplicated in lustre/utils/create_iam.c */
72 typedef __u32 lvar_hash_t;
73
74 /* This is duplicated in lustre/utils/create_iam.c */
75 struct lvar_leaf_entry {
76         __le32 vle_hash;
77         __le16 vle_keysize;
78         u8     vle_key[0];
79 };
80
81 #define PDIFF(ptr0, ptr1) (((char *)(ptr0)) - ((char *)(ptr1)))
82
83
84 static inline int blocksize(const struct iam_leaf *leaf)
85 {
86         return iam_leaf_container(leaf)->ic_object->i_sb->s_blocksize;
87 }
88
89 static inline const char *kchar(const struct iam_key *key)
90 {
91         return (void *)key;
92 }
93
94 static inline struct iam_lentry *lvar_lentry(const struct lvar_leaf_entry *ent)
95 {
96         return (struct iam_lentry *)ent;
97 }
98
99 static inline struct lvar_leaf_entry *lentry_lvar(const struct iam_lentry *lent)
100 {
101         return (struct lvar_leaf_entry *)lent;
102 }
103
104
105 static inline int e_keysize(const struct lvar_leaf_entry *ent)
106 {
107         return le16_to_cpu(ent->vle_keysize);
108 }
109
110 /* This is duplicated in lustre/utils/create_iam.c */
111 enum {
112         LVAR_PAD   = 4,
113         LVAR_ROUND = LVAR_PAD - 1
114 };
115
116 static inline int getsize(const struct iam_leaf *leaf, int namelen, int recsize)
117 {
118         CLASSERT(!(LVAR_PAD & (LVAR_PAD - 1)));
119
120         return (offsetof(struct lvar_leaf_entry, vle_key) +
121                 namelen + recsize + LVAR_ROUND) & ~LVAR_ROUND;
122 }
123
124 static inline int rec_size(const struct iam_rec *rec)
125 {
126         return *(const char *)rec;
127 }
128
129 static inline struct iam_rec *e_rec(const struct lvar_leaf_entry *ent)
130 {
131         return ((void *)ent) +
132                 offsetof(struct lvar_leaf_entry, vle_key) + e_keysize(ent);
133 }
134
135 static inline int e_size(const struct iam_leaf *leaf,
136                          const struct lvar_leaf_entry *ent)
137 {
138         return getsize(leaf, e_keysize(ent), rec_size(e_rec(ent)));
139 }
140
141 static inline char *e_char(const struct lvar_leaf_entry *ent)
142 {
143         return (char *)&ent->vle_key;
144 }
145
146 static inline struct iam_key *e_key(const struct lvar_leaf_entry *ent)
147 {
148         return (struct iam_key *)e_char(ent);
149 }
150
151 static inline lvar_hash_t e_hash(const struct lvar_leaf_entry *ent)
152 {
153         return le32_to_cpu(ent->vle_hash);
154 }
155
156 static void e_print(const struct lvar_leaf_entry *ent)
157 {
158         printk("        %p %8.8x \"%*.*s\"\n", ent, e_hash(ent),
159                e_keysize(ent), e_keysize(ent), e_char(ent));
160 }
161 #if 0
162 static int e_check(const struct iam_leaf *leaf,
163                    const struct lvar_leaf_entry *ent)
164 {
165         const void *point = ent;
166         const void *start = leaf->il_bh->b_data;
167         return
168                 start + sizeof(struct lvar_leaf_header) <= point &&
169                 point + e_size(leaf, ent) < start + blocksize(leaf);
170 }
171 #endif
172
173 static inline struct lvar_leaf_entry *e_next(const struct iam_leaf *leaf,
174                                              const struct lvar_leaf_entry *ent)
175 {
176         return ((void *)ent) + e_size(leaf, ent);
177 }
178
179 #define LVAR_HASH_SANDWICH  (0)
180 #define LVAR_HASH_TEA       (1)
181 #define LVAR_HASH_R5        (0)
182 #define LVAR_HASH_PREFIX    (0)
183
184 static __u32 hash_build0(const char *name, int namelen)
185 {
186         __u32 result;
187
188         if (namelen == 0)
189                 return 0;
190         if (strncmp(name, ".", 1) == 0 && namelen == 1)
191                 return 1;
192         if (strncmp(name, "..", 2) == 0 && namelen == 2)
193                 return 2;
194
195         if (LVAR_HASH_PREFIX) {
196                 result = 0;
197                 strncpy((void *)&result,
198                         name, min(namelen, (int)sizeof result));
199         } else {
200                 struct ldiskfs_dx_hash_info hinfo;
201
202                 hinfo.hash_version = LDISKFS_DX_HASH_TEA;
203                 hinfo.seed = 0;
204                 ldiskfsfs_dirhash(name, namelen, &hinfo);
205                 result = hinfo.hash;
206                 if (LVAR_HASH_SANDWICH) {
207                         __u32 result2;
208
209                         hinfo.hash_version = LDISKFS_DX_HASH_TEA;
210                         hinfo.seed = 0;
211                         ldiskfsfs_dirhash(name, namelen, &hinfo);
212                         result2 = hinfo.hash;
213                         result = (0xfc000000 & result2) | (0x03ffffff & result);
214                 }
215         }
216         return result;
217 }
218
219 enum {
220         HASH_GRAY_AREA = 1024,
221         HASH_MAX_SIZE  = 0x7fffffffUL
222 };
223
224 static __u32 hash_build(const char *name, int namelen)
225 {
226         __u32 hash;
227
228         hash = (hash_build0(name, namelen) << 1) & HASH_MAX_SIZE;
229         if (hash > HASH_MAX_SIZE - HASH_GRAY_AREA)
230                 hash &= HASH_GRAY_AREA - 1;
231         return hash;
232 }
233
234 static inline lvar_hash_t get_hash(const struct iam_container *bag,
235                                    const char *name, int namelen)
236 {
237         return hash_build(name, namelen);
238 }
239
240 static inline int e_eq(const struct lvar_leaf_entry *ent,
241                        const char *name, int namelen)
242 {
243         return namelen == e_keysize(ent) && !memcmp(e_char(ent), name, namelen);
244 }
245
246 static inline int e_cmp(const struct iam_leaf *leaf,
247                         const struct lvar_leaf_entry *ent, lvar_hash_t hash)
248 {
249         lvar_hash_t ehash;
250
251         ehash = e_hash(ent);
252         return ehash == hash ? 0 : (ehash < hash ? -1 : +1);
253 }
254
255 static struct lvar_leaf_header *n_head(const struct iam_leaf *l)
256 {
257         return (struct lvar_leaf_header *)l->il_bh->b_data;
258 }
259
260 static int h_used(const struct lvar_leaf_header *hdr)
261 {
262         return le16_to_cpu(hdr->vlh_used);
263 }
264
265 static void h_used_adj(const struct iam_leaf *leaf,
266                        struct lvar_leaf_header *hdr, int adj)
267 {
268         int used;
269
270         used = h_used(hdr) + adj;
271         assert_corr(sizeof *hdr <= used && used <= blocksize(leaf));
272         hdr->vlh_used = cpu_to_le16(used);
273 }
274
275 static struct lvar_leaf_entry *n_start(const struct iam_leaf *leaf)
276 {
277         return (void *)leaf->il_bh->b_data + sizeof(struct lvar_leaf_header);
278 }
279
280 static struct lvar_leaf_entry *n_end(const struct iam_leaf *l)
281 {
282         return (void *)l->il_bh->b_data + h_used(n_head(l));
283 }
284
285 static struct lvar_leaf_entry *n_cur(const struct iam_leaf *l)
286 {
287         return lentry_lvar(l->il_at);
288 }
289
290 void n_print(const struct iam_leaf *l)
291 {
292         struct lvar_leaf_entry *scan;
293
294         printk(CFS_KERN_EMERG "used: %d\n", h_used(n_head(l)));
295         for (scan = n_start(l); scan < n_end(l); scan = e_next(l, scan))
296                 e_print(scan);
297 }
298
299 #if LDISKFS_CORRECTNESS_ON
300 static int n_at_rec(const struct iam_leaf *folio)
301 {
302         return
303                 n_start(folio) <= lentry_lvar(folio->il_at) &&
304                 lentry_lvar(folio->il_at) < n_end(folio);
305 }
306
307 #if LDISKFS_INVARIANT_ON
308 static int n_invariant(const struct iam_leaf *leaf)
309 {
310         struct iam_path        *path;
311         struct lvar_leaf_entry *scan;
312         struct lvar_leaf_entry *end;
313         lvar_hash_t             hash;
314         lvar_hash_t             nexthash;
315         lvar_hash_t             starthash;
316
317         end  = n_end(leaf);
318         hash = 0;
319         path = leaf->il_path;
320
321         if (h_used(n_head(leaf)) > blocksize(leaf))
322                 return 0;
323
324         /*
325          * Delimiting key in the parent index node. Clear least bit to account
326          * for hash collision marker.
327          */
328         starthash = *(lvar_hash_t *)iam_ikey_at(path, path->ip_frame->at) & ~1;
329         for (scan = n_start(leaf); scan < end; scan = e_next(leaf, scan)) {
330                 nexthash = e_hash(scan);
331                 if (nexthash != get_hash(iam_leaf_container(leaf),
332                                          e_char(scan), e_keysize(scan))) {
333                         BREAKPOINT();
334                         return 0;
335                 }
336                 if (0 && nexthash < starthash) {
337                         /*
338                          * Unfortunately this useful invariant cannot be
339                          * reliably checked as parent node is nor necessarily
340                          * locked.
341                          */
342                         n_print(leaf);
343                         printk("%#x < %#x\n", nexthash, starthash);
344                         dump_stack();
345                         return 0;
346                 }
347                 if (nexthash < hash) {
348                         BREAKPOINT();
349                         return 0;
350                 }
351                 hash = nexthash;
352         }
353         if (scan != end) {
354                 BREAKPOINT();
355                 return 0;
356         }
357         return 1;
358 }
359 /* LDISKFS_INVARIANT_ON */
360 #endif
361
362 /* LDISKFS_CORRECTNESS_ON */
363 #endif
364
365 static struct iam_ikey *lvar_ikey(const struct iam_leaf *l,
366                                   struct iam_ikey *key)
367 {
368         lvar_hash_t *hash;
369
370         assert_corr(n_at_rec(l));
371
372         hash = (void *)key;
373         *hash = e_hash(n_cur(l));
374         return key;
375 }
376
377 static struct iam_key *lvar_key(const struct iam_leaf *l)
378 {
379         return e_key(n_cur(l));
380 }
381
382 static int lvar_key_size(const struct iam_leaf *l)
383 {
384         return e_keysize(n_cur(l));
385 }
386
387 static void lvar_start(struct iam_leaf *l)
388 {
389         l->il_at = lvar_lentry(n_start(l));
390 }
391
392 static int lvar_init(struct iam_leaf *l)
393 {
394         int result;
395         int used;
396         struct lvar_leaf_header *head;
397
398         assert_corr(l->il_bh != NULL);
399
400         head = n_head(l);
401         used = h_used(head);
402         if (head->vlh_magic == le16_to_cpu(IAM_LVAR_LEAF_MAGIC) &&
403             used <= blocksize(l)) {
404                 l->il_at = l->il_entries = lvar_lentry(n_start(l));
405                 result = 0;
406         } else {
407                 struct inode *obj;
408
409                 obj = iam_leaf_container(l)->ic_object;
410                 CERROR("Wrong magic in node %llu (#%lu): %#x != %#x or "
411                        "wrong used: %i",
412                        (unsigned long long)l->il_bh->b_blocknr, obj->i_ino,
413                        head->vlh_magic, le16_to_cpu(IAM_LVAR_LEAF_MAGIC),
414                        used);
415                 result = -EIO;
416         }
417         return result;
418 }
419
420 static void lvar_fini(struct iam_leaf *l)
421 {
422         l->il_entries = l->il_at = NULL;
423 }
424
425 struct iam_rec *lvar_rec(const struct iam_leaf *l)
426 {
427         assert_corr(n_at_rec(l));
428         return e_rec(n_cur(l));
429 }
430
431 static void lvar_next(struct iam_leaf *l)
432 {
433         assert_corr(n_at_rec(l));
434         assert_corr(iam_leaf_is_locked(l));
435         l->il_at = lvar_lentry(e_next(l, n_cur(l)));
436 }
437
438 static int lvar_lookup(struct iam_leaf *leaf, const struct iam_key *k)
439 {
440         struct lvar_leaf_entry *found;
441         struct lvar_leaf_entry *scan;
442         struct lvar_leaf_entry *end;
443         int                     result;
444         const char             *name;
445         int                     namelen;
446         int                     found_equal;
447         lvar_hash_t             hash;
448         int                     last;
449
450         assert_inv(n_invariant(leaf));
451         end = n_end(leaf);
452
453         name = kchar(k);
454         namelen = strlen(name);
455         hash = get_hash(iam_leaf_container(leaf), name, namelen);
456         found = NULL;
457         found_equal = 0;
458         last = 1;
459
460         for (scan = n_start(leaf); scan < end; scan = e_next(leaf, scan)) {
461                 lvar_hash_t scan_hash;
462
463                 scan_hash = e_hash(scan);
464                 if (scan_hash < hash)
465                         found = scan;
466                 else if (scan_hash == hash) {
467                         if (e_eq(scan, name, namelen)) {
468                                 /*
469                                  * perfect match
470                                  */
471                                 leaf->il_at = lvar_lentry(scan);
472                                 return IAM_LOOKUP_EXACT;
473                         } else if (!found_equal) {
474                                         found = scan;
475                                         found_equal = 1;
476                         }
477                 } else {
478                         last = 0;
479                         break;
480                 }
481         }
482         if (found == NULL) {
483                 /*
484                  * @k is less than all hashes in the leaf.
485                  */
486                 lvar_start(leaf);
487                 result = IAM_LOOKUP_BEFORE;
488         } else {
489                 leaf->il_at = lvar_lentry(found);
490                 result = IAM_LOOKUP_OK;
491                 assert_corr(n_at_rec(leaf));
492         }
493         if (last)
494                 result |= IAM_LOOKUP_LAST;
495         assert_inv(n_invariant(leaf));
496
497         return result;
498 }
499
500 static int lvar_ilookup(struct iam_leaf *leaf, const struct iam_ikey *ik)
501 {
502         struct lvar_leaf_entry *scan;
503         struct lvar_leaf_entry *end;
504         lvar_hash_t             hash;
505
506         assert_inv(n_invariant(leaf));
507         end  = n_end(leaf);
508         hash = *(const lvar_hash_t *)ik;
509
510         lvar_start(leaf);
511         for (scan = n_start(leaf); scan < end; scan = e_next(leaf, scan)) {
512                 lvar_hash_t scan_hash;
513
514                 scan_hash = e_hash(scan);
515                 if (scan_hash > hash)
516                         return scan == n_start(leaf) ?
517                                 IAM_LOOKUP_BEFORE : IAM_LOOKUP_OK;
518                 leaf->il_at = lvar_lentry(scan);
519                 if (scan_hash == hash)
520                         return IAM_LOOKUP_EXACT;
521         }
522         assert_inv(n_invariant(leaf));
523         /*
524          * @ik is greater than any key in the node. Return last record in the
525          * node.
526          */
527         return IAM_LOOKUP_OK;
528 }
529
530 static void __lvar_key_set(struct iam_leaf *l, const struct iam_key *k)
531 {
532         memcpy(e_key(n_cur(l)), k, e_keysize(n_cur(l)));
533 }
534
535 static void lvar_key_set(struct iam_leaf *l, const struct iam_key *k)
536 {
537         assert_corr(n_at_rec(l));
538         assert_corr(strlen(kchar(k)) == e_keysize(n_cur(l)));
539         assert_corr(iam_leaf_is_locked(l));
540         __lvar_key_set(l, k);
541         assert_inv(n_invariant(l));
542 }
543
544 static int lvar_key_cmp(const struct iam_leaf *l, const struct iam_key *k)
545 {
546         lvar_hash_t hash;
547         const char *name;
548
549         name = kchar(k);
550
551         hash = get_hash(iam_leaf_container(l), name, strlen(name));
552         return e_cmp(l, n_cur(l), hash);
553 }
554
555 static int lvar_key_eq(const struct iam_leaf *l, const struct iam_key *k)
556 {
557         const char *name;
558
559         name = kchar(k);
560         return e_eq(n_cur(l), name, strlen(name));
561 }
562
563 static void __lvar_rec_set(struct iam_leaf *l, const struct iam_rec *r)
564 {
565         memcpy(e_rec(n_cur(l)), r, rec_size(r));
566 }
567
568 static void lvar_rec_set(struct iam_leaf *l, const struct iam_rec *r)
569 {
570         assert_corr(n_at_rec(l));
571         assert_corr(iam_leaf_is_locked(l));
572         __lvar_rec_set(l, r);
573         assert_inv(n_invariant(l));
574 }
575
576 static void lvar_rec_get(const struct iam_leaf *l, struct iam_rec *r)
577 {
578         struct iam_rec *rec;
579
580         rec = e_rec(n_cur(l));
581         assert_corr(n_at_rec(l));
582         assert_corr(iam_leaf_is_locked(l));
583         memcpy(r, rec, rec_size(rec));
584         assert_inv(n_invariant(l));
585 }
586
587 static int lvar_can_add(const struct iam_leaf *l,
588                         const struct iam_key *k, const struct iam_rec *r)
589 {
590         assert_corr(iam_leaf_is_locked(l));
591         return
592                 h_used(n_head(l)) +
593                 getsize(l, strlen(kchar(k)), rec_size(r)) <= blocksize(l);
594 }
595
596 static int lvar_at_end(const struct iam_leaf *folio)
597 {
598         assert_corr(iam_leaf_is_locked(folio));
599         return n_cur(folio) == n_end(folio);
600 }
601
602 static void lvar_rec_add(struct iam_leaf *leaf,
603                          const struct iam_key *k, const struct iam_rec *r)
604 {
605         const char *key;
606         int   ksize;
607         int   shift;
608         void *end;
609         void *start;
610         ptrdiff_t diff;
611
612         assert_corr(lvar_can_add(leaf, k, r));
613         assert_inv(n_invariant(leaf));
614         assert_corr(iam_leaf_is_locked(leaf));
615
616         key   = kchar(k);
617         ksize = strlen(key);
618         shift = getsize(leaf, ksize, rec_size(r));
619
620         if (!lvar_at_end(leaf)) {
621                 assert_corr(n_cur(leaf) < n_end(leaf));
622                 end = n_end(leaf);
623                 if (lvar_key_cmp(leaf, k) <= 0)
624                         lvar_next(leaf);
625                 else
626                         /*
627                          * Another exceptional case: insertion with the key
628                          * less than least key in the leaf.
629                          */
630                         assert_corr(leaf->il_at == leaf->il_entries);
631
632                 start = leaf->il_at;
633                 diff  = PDIFF(end, start);
634                 assert_corr(diff >= 0);
635                 memmove(start + shift, start, diff);
636         }
637         h_used_adj(leaf, n_head(leaf), shift);
638         n_cur(leaf)->vle_keysize = cpu_to_le16(ksize);
639         n_cur(leaf)->vle_hash = cpu_to_le32(get_hash(iam_leaf_container(leaf),
640                                                      key, ksize));
641         __lvar_key_set(leaf, k);
642         __lvar_rec_set(leaf, r);
643         assert_corr(n_at_rec(leaf));
644         assert_inv(n_invariant(leaf));
645 }
646
647 static void lvar_rec_del(struct iam_leaf *leaf, int shift)
648 {
649         void *next;
650         void *end;
651         int nob;
652
653         assert_corr(n_at_rec(leaf));
654         assert_inv(n_invariant(leaf));
655         assert_corr(iam_leaf_is_locked(leaf));
656
657         end  = n_end(leaf);
658         next = e_next(leaf, n_cur(leaf));
659         nob  = e_size(leaf, n_cur(leaf));
660         memmove(leaf->il_at, next, end - next);
661         h_used_adj(leaf, n_head(leaf), -nob);
662         assert_inv(n_invariant(leaf));
663 }
664
665 static void lvar_init_new(struct iam_container *c, struct buffer_head *bh)
666 {
667         struct lvar_leaf_header *hdr;
668
669         hdr = (struct lvar_leaf_header *)bh->b_data;
670         hdr->vlh_magic = cpu_to_le16(IAM_LVAR_LEAF_MAGIC);
671         hdr->vlh_used  = sizeof *hdr;
672 }
673
674 static struct lvar_leaf_entry *find_pivot(const struct iam_leaf *leaf,
675                                           struct lvar_leaf_entry **prev)
676 {
677         void *scan;
678         void *start;
679         int threshold;
680
681         *prev = NULL;
682         threshold = blocksize(leaf) / 2;
683         for (scan = start = n_start(leaf); scan - start <= threshold;
684              *prev = scan, scan = e_next(leaf, scan)) {
685                 ;
686         }
687         return scan;
688 }
689
690 static void lvar_split(struct iam_leaf *leaf, struct buffer_head **bh,
691                        iam_ptr_t new_blknr)
692 {
693         struct lvar_leaf_entry  *first_to_move;
694         struct lvar_leaf_entry  *last_to_stay;
695         struct iam_path         *path;
696         struct lvar_leaf_header *hdr;
697         struct buffer_head      *new_leaf;
698
699         ptrdiff_t   tomove;
700         lvar_hash_t hash;
701
702         assert_inv(n_invariant(leaf));
703         assert_corr(iam_leaf_is_locked(leaf));
704
705         new_leaf = *bh;
706         path = iam_leaf_path(leaf);
707
708         hdr = (void *)new_leaf->b_data;
709
710         first_to_move = find_pivot(leaf, &last_to_stay);
711         assert_corr(last_to_stay != NULL);
712         assert_corr(e_next(leaf, last_to_stay) == first_to_move);
713
714         hash = e_hash(first_to_move);
715         if (hash == e_hash(last_to_stay))
716                 /*
717                  * Duplicate hash.
718                  */
719                 hash |= 1;
720
721         tomove = PDIFF(n_end(leaf), first_to_move);
722         memmove(hdr + 1, first_to_move, tomove);
723
724         h_used_adj(leaf, hdr, tomove);
725         h_used_adj(leaf, n_head(leaf), -tomove);
726
727         assert_corr(n_end(leaf) == first_to_move);
728
729         if (n_cur(leaf) >= first_to_move) {
730                 /*
731                  * insertion point moves into new leaf.
732                  */
733                 ptrdiff_t shift;
734                 int result;
735
736                 shift = PDIFF(leaf->il_at, first_to_move);
737                 *bh = leaf->il_bh;
738                 leaf->il_bh = new_leaf;
739                 leaf->il_curidx = new_blknr;
740
741                 assert_corr(iam_leaf_is_locked(leaf));
742                 result = lvar_init(leaf);
743                 /*
744                  * init cannot fail, as node was just initialized.
745                  */
746                 assert_corr(result == 0);
747                 leaf->il_at = ((void *)leaf->il_at) + shift;
748         }
749         /*
750          * Insert pointer to the new node (together with the least key in
751          * the node) into index node.
752          */
753         iam_insert_key_lock(path, path->ip_frame, (struct iam_ikey *)&hash,
754                             new_blknr);
755         assert_corr(n_cur(leaf) < n_end(leaf));
756         assert_inv(n_invariant(leaf));
757 }
758
759 static struct iam_leaf_operations lvar_leaf_ops = {
760         .init           = lvar_init,
761         .init_new       = lvar_init_new,
762         .fini           = lvar_fini,
763         .start          = lvar_start,
764         .next           = lvar_next,
765         .key            = lvar_key,
766         .ikey           = lvar_ikey,
767         .rec            = lvar_rec,
768         .key_set        = lvar_key_set,
769         .key_cmp        = lvar_key_cmp,
770         .key_eq         = lvar_key_eq,
771         .key_size       = lvar_key_size,
772         .rec_set        = lvar_rec_set,
773         .rec_get        = lvar_rec_get,
774         .lookup         = lvar_lookup,
775         .ilookup        = lvar_ilookup,
776         .at_end         = lvar_at_end,
777         .rec_add        = lvar_rec_add,
778         .rec_del        = lvar_rec_del,
779         .can_add        = lvar_can_add,
780         .split          = lvar_split
781 };
782
783 /*
784  * Index operations.
785  */
786
787 enum {
788         /* This is duplicated in lustre/utils/create_iam.c */
789         /* egrep -i '^o?x?[olabcdef]*$' /usr/share/dict/words */
790         IAM_LVAR_ROOT_MAGIC = 0xb01dface
791 };
792
793 /* This is duplicated in lustre/utils/create_iam.c */
794 struct lvar_root {
795         __le32 vr_magic;
796         __le16 vr_recsize;
797         __le16 vr_ptrsize;
798         u8     vr_indirect_levels;
799         u8     vr_padding0;
800         __le16 vr_padding1;
801 };
802
803 static __u32 lvar_root_ptr(struct iam_container *c)
804 {
805         return 0;
806 }
807
808 static int lvar_node_init(struct iam_container *c, struct buffer_head *bh,
809                           int root)
810 {
811         return 0;
812 }
813
814 static struct iam_entry *lvar_root_inc(struct iam_container *c,
815                                        struct iam_path *path,
816                                        struct iam_frame *frame)
817 {
818         struct lvar_root *root;
819         struct iam_entry *entries;
820
821         assert_corr(iam_frame_is_locked(path, frame));
822         entries = frame->entries;
823
824         dx_set_count(entries, 2);
825         assert_corr(dx_get_limit(entries) == dx_root_limit(path));
826
827         root = (void *)frame->bh->b_data;
828         assert_corr(le64_to_cpu(root->vr_magic) == IAM_LVAR_ROOT_MAGIC);
829         root->vr_indirect_levels ++;
830         frame->at = entries = iam_entry_shift(path, entries, 1);
831         memset(iam_ikey_at(path, entries), 0,
832                iam_path_descr(path)->id_ikey_size);
833         return entries;
834 }
835
836 static int lvar_node_check(struct iam_path *path, struct iam_frame *frame)
837 {
838         unsigned count;
839         unsigned limit;
840         unsigned limit_correct;
841         struct iam_entry *entries;
842
843         entries = dx_node_get_entries(path, frame);
844
845         if (frame == path->ip_frames) {
846                 struct lvar_root *root;
847
848                 root = (void *)frame->bh->b_data;
849                 if (le64_to_cpu(root->vr_magic) != IAM_LVAR_ROOT_MAGIC)
850                         return -EIO;
851                 limit_correct = dx_root_limit(path);
852         } else
853                 limit_correct = dx_node_limit(path);
854         count = dx_get_count(entries);
855         limit = dx_get_limit(entries);
856         if (count > limit)
857                 return -EIO;
858         if (limit != limit_correct)
859                 return -EIO;
860         return 0;
861 }
862
863 static int lvar_node_load(struct iam_path *path, struct iam_frame *frame)
864 {
865         struct iam_entry *entries;
866         void *data;
867         entries = dx_node_get_entries(path, frame);
868
869         data = frame->bh->b_data;
870
871         if (frame == path->ip_frames) {
872                 struct lvar_root *root;
873                 const char *name;
874
875                 root = data;
876                 name = kchar(path->ip_key_target);
877                 path->ip_indirect = root->vr_indirect_levels;
878                 if (path->ip_ikey_target == NULL) {
879                         path->ip_ikey_target = iam_path_ikey(path, 4);
880                         *(lvar_hash_t *)path->ip_ikey_target =
881                                 get_hash(path->ip_container, name,
882                                          strlen(name));
883                 }
884         }
885         frame->entries = frame->at = entries;
886         return 0;
887 }
888
889 static int lvar_ikeycmp(const struct iam_container *c,
890                         const struct iam_ikey *k1, const struct iam_ikey *k2)
891 {
892         lvar_hash_t p1 = le32_to_cpu(*(lvar_hash_t *)k1);
893         lvar_hash_t p2 = le32_to_cpu(*(lvar_hash_t *)k2);
894
895         return p1 > p2 ? +1 : (p1 < p2 ? -1 : 0);
896 }
897
898 static struct iam_path_descr *lvar_ipd_alloc(const struct iam_container *c,
899                                              void *area)
900 {
901         return iam_ipd_alloc(area, c->ic_descr->id_ikey_size);
902 }
903
904 static void lvar_root(void *buf,
905                       int blocksize, int keysize, int ptrsize, int recsize)
906 {
907         struct lvar_root *root;
908         struct dx_countlimit *limit;
909         void                 *entry;
910         int isize;
911
912         isize = sizeof(lvar_hash_t) + ptrsize;
913         root = buf;
914         *root = (typeof(*root)) {
915                 .vr_magic            = cpu_to_le32(IAM_LVAR_ROOT_MAGIC),
916                 .vr_recsize          = cpu_to_le16(recsize),
917                 .vr_ptrsize          = cpu_to_le16(ptrsize),
918                 .vr_indirect_levels  = 0
919         };
920
921         limit = (void *)(root + 1);
922         *limit = (typeof(*limit)){
923                 /*
924                  * limit itself + one pointer to the leaf.
925                  */
926                 .count = cpu_to_le16(2),
927                 .limit = iam_root_limit(sizeof(struct lvar_root), blocksize,
928                                         sizeof (lvar_hash_t) + ptrsize)
929         };
930
931         entry = root + 1;
932         /*
933          * Skip over @limit.
934          */
935         entry += isize;
936
937         /*
938          * Entry format is <key> followed by <ptr>. In the minimal tree
939          * consisting of a root and single node, <key> is a minimal possible
940          * key.
941          */
942         *(lvar_hash_t *)entry = 0;
943         entry += sizeof(lvar_hash_t);
944         /* now @entry points to <ptr> */
945         if (ptrsize == 4)
946                 *(u_int32_t *)entry = cpu_to_le32(1);
947         else
948                 *(u_int64_t *)entry = cpu_to_le64(1);
949 }
950
951 static int lvar_esize(int namelen, int recsize)
952 {
953         return (offsetof(struct lvar_leaf_entry, vle_key) +
954                 namelen + recsize + LVAR_ROUND) & ~LVAR_ROUND;
955 }
956
957 static void lvar_leaf(void *buf,
958                       int blocksize, int keysize, int ptrsize, int recsize)
959 {
960         struct lvar_leaf_header *head;
961         struct lvar_leaf_entry  *entry;
962
963         /* form leaf */
964         head = buf;
965         *head = (typeof(*head)) {
966                 .vlh_magic = cpu_to_le16(IAM_LVAR_LEAF_MAGIC),
967                 .vlh_used  = cpu_to_le16(sizeof *head + lvar_esize(0, recsize))
968         };
969         entry = (void *)(head + 1);
970         *entry = (typeof(*entry)) {
971                 .vle_hash    = 0,
972                 .vle_keysize = 0
973         };
974         memset(e_rec(entry), 0, recsize);
975         *(char *)e_rec(entry) = recsize;
976 }
977
978 int iam_lvar_create(struct inode *obj,
979                     int keysize, int ptrsize, int recsize, handle_t *handle)
980 {
981         struct buffer_head *root_node;
982         struct buffer_head *leaf_node;
983         struct super_block *sb;
984
985         u32 blknr;
986         int result;
987         unsigned long bsize;
988
989         assert_corr(obj->i_size == 0);
990
991         sb = obj->i_sb;
992         bsize = sb->s_blocksize;
993         root_node = ldiskfs_append(handle, obj, &blknr, &result);
994         leaf_node = ldiskfs_append(handle, obj, &blknr, &result);
995         if (root_node != NULL && leaf_node != NULL) {
996                 lvar_root(root_node->b_data, bsize, keysize, ptrsize, recsize);
997                 lvar_leaf(leaf_node->b_data, bsize, keysize, ptrsize, recsize);
998                 ldiskfs_mark_inode_dirty(handle, obj);
999                 result = ldiskfs_journal_dirty_metadata(handle, root_node);
1000                 if (result == 0)
1001                         result = ldiskfs_journal_dirty_metadata(handle, leaf_node);
1002                 if (result != 0)
1003                         ldiskfs_std_error(sb, result);
1004         }
1005         brelse(leaf_node);
1006         brelse(root_node);
1007         return result;
1008 }
1009 EXPORT_SYMBOL(iam_lvar_create);
1010
1011 static struct iam_operations lvar_ops = {
1012         .id_root_ptr    = lvar_root_ptr,
1013         .id_node_read   = iam_node_read,
1014         .id_node_init   = lvar_node_init,
1015         .id_node_check  = lvar_node_check,
1016         .id_node_load   = lvar_node_load,
1017         .id_ikeycmp     = lvar_ikeycmp,
1018         .id_root_inc    = lvar_root_inc,
1019         .id_ipd_alloc   = lvar_ipd_alloc,
1020         .id_ipd_free    = iam_ipd_free,
1021         .id_name        = "lvar"
1022 };
1023
1024 static int lvar_guess(struct iam_container *c)
1025 {
1026         int result;
1027         struct buffer_head *bh;
1028         const struct lvar_root *root;
1029
1030         assert_corr(c->ic_object != NULL);
1031
1032         result = iam_node_read(c, lvar_root_ptr(c), NULL, &bh);
1033         if (result == 0) {
1034                 root = (void *)bh->b_data;
1035                 if (le64_to_cpu(root->vr_magic) == IAM_LVAR_ROOT_MAGIC) {
1036                         struct iam_descr *descr;
1037
1038                         descr = c->ic_descr;
1039                         descr->id_key_size  = LDISKFS_NAME_LEN;
1040                         descr->id_ikey_size = sizeof (lvar_hash_t);
1041                         descr->id_rec_size  = le16_to_cpu(root->vr_recsize);
1042                         descr->id_ptr_size  = le16_to_cpu(root->vr_ptrsize);
1043                         descr->id_root_gap  = sizeof *root;
1044                         descr->id_node_gap  = 0;
1045                         descr->id_ops       = &lvar_ops;
1046                         descr->id_leaf_ops  = &lvar_leaf_ops;
1047                 } else
1048                         result = -EBADF;
1049                 brelse(bh);
1050         }
1051         return result;
1052 }
1053
1054 static struct iam_format lvar_format = {
1055         .if_guess = lvar_guess
1056 };
1057
1058 void iam_lvar_format_init(void)
1059 {
1060         iam_format_register(&lvar_format);
1061 }
1062