/*
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.gnu.org/licenses/gpl-2.0.html
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2012, 2015, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 *
 * iam_lvar.c
 *
 * Implementation of the iam format for fixed-size records and variable-sized
 * keys.
 *
 * Author: Nikita Danilov <nikita@clusterfs.com>
 */

#include <linux/types.h>
#include "osd_internal.h"

/*
 * Leaf operations.
 */

enum {
        /* This is duplicated in lustre/utils/create_iam.c */
        IAM_LVAR_LEAF_MAGIC = 0x1973
};

/* This is duplicated in lustre/utils/create_iam.c */
struct lvar_leaf_header {
        __le16 vlh_magic; /* magic number IAM_LVAR_LEAF_MAGIC */
        __le16 vlh_used;  /* used bytes, including header */
};
/*
 * Format of leaf entry:
 *
 * __le32 hash
 * __le16 keysize
 *     u8 key[keysize]
 *     u8 record[rec_size]
 *
 * Each entry is padded to a multiple of LVAR_PAD bytes (see getsize()).
 * Entries are ordered by hash.
 */

/* This is duplicated in lustre/utils/create_iam.c */
typedef u32 lvar_hash_t;

/* This is duplicated in lustre/utils/create_iam.c */
struct lvar_leaf_entry {
        __le32 vle_hash;
        __le16 vle_keysize;
        u8 vle_key[0];
};

#define PDIFF(ptr0, ptr1) (((char *)(ptr0)) - ((char *)(ptr1)))


static inline int blocksize(const struct iam_leaf *leaf)
{
        return iam_leaf_container(leaf)->ic_object->i_sb->s_blocksize;
}

static inline const char *kchar(const struct iam_key *key)
{
        return (void *)key;
}

static inline struct iam_lentry *lvar_lentry(const struct lvar_leaf_entry *ent)
{
        return (struct iam_lentry *)ent;
}

static inline struct lvar_leaf_entry *lentry_lvar(const struct iam_lentry *lent)
{
        return (struct lvar_leaf_entry *)lent;
}


static inline int e_keysize(const struct lvar_leaf_entry *ent)
{
        return le16_to_cpu(ent->vle_keysize);
}

/* This is duplicated in lustre/utils/create_iam.c */
enum {
        LVAR_PAD   = 4,
        LVAR_ROUND = LVAR_PAD - 1
};

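/*
 * Illustrative example (not from the original source): with LVAR_PAD == 4,
 * getsize() below rounds every entry up to a multiple of 4 bytes.  Assuming
 * the usual layout of struct lvar_leaf_entry (offsetof(vle_key) == 6), a
 * 5-byte key with a 32-byte record occupies 6 + 5 + 32 = 43 bytes of raw
 * data, padded to 44 bytes in the leaf block.
 */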
static inline int getsize(const struct iam_leaf *leaf, int namelen, int recsize)
{
        BUILD_BUG_ON((LVAR_PAD & (LVAR_PAD - 1)));

        return (offsetof(struct lvar_leaf_entry, vle_key) +
                        namelen + recsize + LVAR_ROUND) & ~LVAR_ROUND;
}

static inline int rec_size(const struct iam_rec *rec)
{
        return *(const char *)rec;
}

static inline struct iam_rec *e_rec(const struct lvar_leaf_entry *ent)
{
        return ((void *)ent) +
                offsetof(struct lvar_leaf_entry, vle_key) + e_keysize(ent);
}

static inline int e_size(const struct iam_leaf *leaf,
                         const struct lvar_leaf_entry *ent)
{
        return getsize(leaf, e_keysize(ent), rec_size(e_rec(ent)));
}

static inline char *e_char(const struct lvar_leaf_entry *ent)
{
        return (char *)&ent->vle_key;
}

static inline struct iam_key *e_key(const struct lvar_leaf_entry *ent)
{
        return (struct iam_key *)e_char(ent);
}

static inline lvar_hash_t e_hash(const struct lvar_leaf_entry *ent)
{
        return le32_to_cpu(ent->vle_hash);
}

static void e_print(const struct lvar_leaf_entry *ent)
{
        CERROR("        %p %8.8x \"%*.*s\"\n", ent, e_hash(ent),
                        e_keysize(ent), e_keysize(ent), e_char(ent));
}

static inline struct lvar_leaf_entry *e_next(const struct iam_leaf *leaf,
                                             const struct lvar_leaf_entry *ent)
{
        return ((void *)ent) + e_size(leaf, ent);
}

#define LVAR_HASH_SANDWICH  (0)
#define LVAR_HASH_TEA       (1)
#define LVAR_HASH_R5        (0)
#define LVAR_HASH_PREFIX    (0)

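/*
 * Descriptive note (added for clarity): hash_build0() maps the empty name,
 * "." and ".." to the reserved hash values 0, 1 and 2.  Any other name is
 * hashed either by copying its first bytes (when LVAR_HASH_PREFIX is set)
 * or, as configured above, with the ldiskfs TEA directory hash; the
 * "sandwich" variant would combine two hashes into a single word.
 */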
static u32 hash_build0(const char *name, int namelen)
{
        u32 result;

        if (namelen == 0)
                return 0;
        if (strncmp(name, ".", 1) == 0 && namelen == 1)
                return 1;
        if (strncmp(name, "..", 2) == 0 && namelen == 2)
                return 2;

        if (LVAR_HASH_PREFIX) {
                result = 0;
                strncpy((void *)&result,
                        name, min_t(int, namelen, sizeof(result)));
        } else {
                struct ldiskfs_dx_hash_info hinfo;

                hinfo.hash_version = LDISKFS_DX_HASH_TEA;
                hinfo.seed = NULL;
                ldiskfsfs_dirhash(name, namelen, &hinfo);
                result = hinfo.hash;
                if (LVAR_HASH_SANDWICH) {
                        u32 result2;

                        hinfo.hash_version = LDISKFS_DX_HASH_TEA;
                        hinfo.seed = NULL;
                        ldiskfsfs_dirhash(name, namelen, &hinfo);
                        result2 = hinfo.hash;
                        result = (0xfc000000 & result2) | (0x03ffffff & result);
                }
        }
        return result;
}

enum {
        HASH_GRAY_AREA = 1024,
        HASH_MAX_SIZE  = 0x7fffffffUL
};

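/*
 * Descriptive note (added for clarity): hash_build() shifts the raw hash
 * left by one bit, reserving the least significant bit as a hash-collision
 * marker (it is set in lvar_split() and masked off in n_invariant()), and
 * clamps the result to 31 bits.  Values falling into the topmost
 * HASH_GRAY_AREA of the range are wrapped down to small values, apparently
 * to keep some headroom below HASH_MAX_SIZE.
 */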
static u32 hash_build(const char *name, int namelen)
{
        u32 hash;

        hash = (hash_build0(name, namelen) << 1) & HASH_MAX_SIZE;
        if (hash > HASH_MAX_SIZE - HASH_GRAY_AREA)
                hash &= HASH_GRAY_AREA - 1;
        return hash;
}

static inline lvar_hash_t get_hash(const struct iam_container *bag,
                                   const char *name, int namelen)
{
        return hash_build(name, namelen);
}

static inline int e_eq(const struct lvar_leaf_entry *ent,
                       const char *name, int namelen)
{
        return namelen == e_keysize(ent) && !memcmp(e_char(ent), name, namelen);
}

static inline int e_cmp(const struct iam_leaf *leaf,
                        const struct lvar_leaf_entry *ent, lvar_hash_t hash)
{
        lvar_hash_t ehash;

        ehash = e_hash(ent);
        return ehash == hash ? 0 : (ehash < hash ? -1 : 1);
}

static struct lvar_leaf_header *n_head(const struct iam_leaf *l)
{
        return (struct lvar_leaf_header *)l->il_bh->b_data;
}

static int h_used(const struct lvar_leaf_header *hdr)
{
        return le16_to_cpu(hdr->vlh_used);
}

static void h_used_adj(const struct iam_leaf *leaf,
                       struct lvar_leaf_header *hdr, int adj)
{
        int used;

        used = h_used(hdr) + adj;
        assert_corr(sizeof(*hdr) <= used && used <= blocksize(leaf));
        hdr->vlh_used = cpu_to_le16(used);
}

static struct lvar_leaf_entry *n_start(const struct iam_leaf *leaf)
{
        return (void *)leaf->il_bh->b_data + sizeof(struct lvar_leaf_header);
}

static struct lvar_leaf_entry *n_end(const struct iam_leaf *l)
{
        return (void *)l->il_bh->b_data + h_used(n_head(l));
}

static struct lvar_leaf_entry *n_cur(const struct iam_leaf *l)
{
        return lentry_lvar(l->il_at);
}

void n_print(const struct iam_leaf *l)
{
        struct lvar_leaf_entry *scan;

        CERROR("used: %d\n", h_used(n_head(l)));
        for (scan = n_start(l); scan < n_end(l); scan = e_next(l, scan))
                e_print(scan);
}

#if LDISKFS_CORRECTNESS_ON
static int n_at_rec(const struct iam_leaf *folio)
{
        return n_start(folio) <= lentry_lvar(folio->il_at) &&
                lentry_lvar(folio->il_at) < n_end(folio);
}

#if LDISKFS_INVARIANT_ON
static int n_invariant(const struct iam_leaf *leaf)
{
        struct iam_path *path;
        struct lvar_leaf_entry *scan;
        struct lvar_leaf_entry *end;
        lvar_hash_t hash;
        lvar_hash_t nexthash;
        lvar_hash_t starthash;

        end  = n_end(leaf);
        hash = 0;
        path = leaf->il_path;

        if (h_used(n_head(leaf)) > blocksize(leaf))
                return 0;

        /*
         * Delimiting key in the parent index node. Clear least bit to account
         * for hash collision marker.
         */
        starthash = *(lvar_hash_t *)iam_ikey_at(path, path->ip_frame->at) & ~1;
        for (scan = n_start(leaf); scan < end; scan = e_next(leaf, scan)) {
                nexthash = e_hash(scan);
                if (nexthash != get_hash(iam_leaf_container(leaf),
                                        e_char(scan), e_keysize(scan))) {
                        BREAKPOINT();
                        return 0;
                }
                if (0 && nexthash < starthash) {
                        /*
                         * Unfortunately this useful invariant cannot be
                         * reliably checked as parent node is not necessarily
                         * locked.
                         */
                        n_print(leaf);
                        CERROR("%#x < %#x\n", nexthash, starthash);
                        dump_stack();
                        return 0;
                }
                if (nexthash < hash) {
                        BREAKPOINT();
                        return 0;
                }
                hash = nexthash;
        }
        if (scan != end) {
                BREAKPOINT();
                return 0;
        }
        return 1;
}
/* LDISKFS_INVARIANT_ON */
#endif

/* LDISKFS_CORRECTNESS_ON */
#endif

static struct iam_ikey *lvar_ikey(const struct iam_leaf *l,
                                  struct iam_ikey *key)
{
        lvar_hash_t *hash;

        assert_corr(n_at_rec(l));

        hash = (void *)key;
        *hash = e_hash(n_cur(l));
        return key;
}

static struct iam_key *lvar_key(const struct iam_leaf *l)
{
        return e_key(n_cur(l));
}

static int lvar_key_size(const struct iam_leaf *l)
{
        return e_keysize(n_cur(l));
}

static void lvar_start(struct iam_leaf *l)
{
        l->il_at = lvar_lentry(n_start(l));
}

static int lvar_init(struct iam_leaf *l)
{
        int result;
        int used;
        struct lvar_leaf_header *head;

        assert_corr(l->il_bh != NULL);

        head = n_head(l);
        used = h_used(head);
        if (le16_to_cpu(head->vlh_magic) == IAM_LVAR_LEAF_MAGIC &&
                        used <= blocksize(l)) {
                l->il_at = l->il_entries = lvar_lentry(n_start(l));
                result = 0;
        } else {
                struct inode *obj;

                obj = iam_leaf_container(l)->ic_object;
                CERROR(
                "Bad magic in node %llu (#%lu): %#x != %#x or wrong used: %d\n",
                (unsigned long long)l->il_bh->b_blocknr, obj->i_ino,
                le16_to_cpu(head->vlh_magic), IAM_LVAR_LEAF_MAGIC,
                used);
                result = -EIO;
        }
        return result;
}

static void lvar_fini(struct iam_leaf *l)
{
        l->il_entries = l->il_at = NULL;
}

static struct iam_rec *lvar_rec(const struct iam_leaf *l)
{
        assert_corr(n_at_rec(l));
        return e_rec(n_cur(l));
}

static void lvar_next(struct iam_leaf *l)
{
        assert_corr(n_at_rec(l));
        assert_corr(iam_leaf_is_locked(l));
        l->il_at = lvar_lentry(e_next(l, n_cur(l)));
}

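/*
 * Descriptive note (added for clarity): lvar_lookup() positions the leaf
 * cursor for the name @k.  It returns IAM_LOOKUP_EXACT when the name itself
 * is present, IAM_LOOKUP_BEFORE when the name hashes below every entry in
 * the leaf (cursor moved to the first entry), and IAM_LOOKUP_OK otherwise
 * (cursor at the last entry with a smaller hash, or at the first entry with
 * an equal hash).  IAM_LOOKUP_LAST is OR-ed in when no entry with a larger
 * hash exists in this leaf.
 */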
static int lvar_lookup(struct iam_leaf *leaf, const struct iam_key *k)
{
        struct lvar_leaf_entry *found;
        struct lvar_leaf_entry *scan;
        struct lvar_leaf_entry *end;
        int result;
        const char *name;
        int namelen;
        int found_equal;
        lvar_hash_t hash;
        int last;

        assert_inv(n_invariant(leaf));
        end = n_end(leaf);

        name = kchar(k);
        namelen = strlen(name);
        hash = get_hash(iam_leaf_container(leaf), name, namelen);
        found = NULL;
        found_equal = 0;
        last = 1;

        for (scan = n_start(leaf); scan < end; scan = e_next(leaf, scan)) {
                lvar_hash_t scan_hash;

                scan_hash = e_hash(scan);
                if (scan_hash < hash)
                        found = scan;
                else if (scan_hash == hash) {
                        if (e_eq(scan, name, namelen)) {
                                /*
                                 * perfect match
                                 */
                                leaf->il_at = lvar_lentry(scan);
                                return IAM_LOOKUP_EXACT;
                        } else if (!found_equal) {
                                found = scan;
                                found_equal = 1;
                        }
                } else {
                        last = 0;
                        break;
                }
        }
        if (found == NULL) {
                /*
                 * @k is less than all hashes in the leaf.
                 */
                lvar_start(leaf);
                result = IAM_LOOKUP_BEFORE;
        } else {
                leaf->il_at = lvar_lentry(found);
                result = IAM_LOOKUP_OK;
                assert_corr(n_at_rec(leaf));
        }
        if (last)
                result |= IAM_LOOKUP_LAST;
        assert_inv(n_invariant(leaf));

        return result;
}

static int lvar_ilookup(struct iam_leaf *leaf, const struct iam_ikey *ik)
{
        struct lvar_leaf_entry *scan;
        struct lvar_leaf_entry *end;
        lvar_hash_t hash;

        assert_inv(n_invariant(leaf));
        end  = n_end(leaf);
        hash = *(const lvar_hash_t *)ik;

        lvar_start(leaf);
        for (scan = n_start(leaf); scan < end; scan = e_next(leaf, scan)) {
                lvar_hash_t scan_hash;

                scan_hash = e_hash(scan);
                if (scan_hash > hash)
                        return scan == n_start(leaf) ?
                                IAM_LOOKUP_BEFORE : IAM_LOOKUP_OK;
                leaf->il_at = lvar_lentry(scan);
                if (scan_hash == hash)
                        return IAM_LOOKUP_EXACT;
        }
        assert_inv(n_invariant(leaf));
        /*
         * @ik is greater than any key in the node. Return last record in the
         * node.
         */
        return IAM_LOOKUP_OK;
}

static void __lvar_key_set(struct iam_leaf *l, const struct iam_key *k)
{
        memcpy(e_key(n_cur(l)), k, e_keysize(n_cur(l)));
}

static void lvar_key_set(struct iam_leaf *l, const struct iam_key *k)
{
        assert_corr(n_at_rec(l));
        assert_corr(strlen(kchar(k)) == e_keysize(n_cur(l)));
        assert_corr(iam_leaf_is_locked(l));
        __lvar_key_set(l, k);
        assert_inv(n_invariant(l));
}

static int lvar_key_cmp(const struct iam_leaf *l, const struct iam_key *k)
{
        lvar_hash_t hash;
        const char *name;

        name = kchar(k);

        hash = get_hash(iam_leaf_container(l), name, strlen(name));
        return e_cmp(l, n_cur(l), hash);
}

static int lvar_key_eq(const struct iam_leaf *l, const struct iam_key *k)
{
        const char *name;

        name = kchar(k);
        return e_eq(n_cur(l), name, strlen(name));
}

static void __lvar_rec_set(struct iam_leaf *l, const struct iam_rec *r)
{
        memcpy(e_rec(n_cur(l)), r, rec_size(r));
}

static void lvar_rec_set(struct iam_leaf *l, const struct iam_rec *r)
{
        assert_corr(n_at_rec(l));
        assert_corr(iam_leaf_is_locked(l));
        __lvar_rec_set(l, r);
        assert_inv(n_invariant(l));
}

static int lvar_rec_eq(const struct iam_leaf *l, const struct iam_rec *r)
{
        struct iam_rec *rec = e_rec(n_cur(l));

        if (rec_size(rec) != rec_size(r))
                return 0;
        return !memcmp(rec, r, rec_size(r));
}

static void lvar_rec_get(const struct iam_leaf *l, struct iam_rec *r)
{
        struct iam_rec *rec;

        rec = e_rec(n_cur(l));
        assert_corr(n_at_rec(l));
        assert_corr(iam_leaf_is_locked(l));
        memcpy(r, rec, rec_size(rec));
        assert_inv(n_invariant(l));
}

static int lvar_can_add(const struct iam_leaf *l,
                        const struct iam_key *k, const struct iam_rec *r)
{
        assert_corr(iam_leaf_is_locked(l));
        return h_used(n_head(l)) +
                getsize(l, strlen(kchar(k)), rec_size(r)) <= blocksize(l);
}

static int lvar_at_end(const struct iam_leaf *folio)
{
        assert_corr(iam_leaf_is_locked(folio));
        return n_cur(folio) == n_end(folio);
}

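/*
 * Descriptive note (added for clarity): lvar_rec_add() inserts a new entry
 * at the current cursor position.  Unless the cursor is at the end of the
 * block, the tail of the block is shifted right by the rounded entry size
 * (see getsize()) to open a gap; the hash, key size, key and record are
 * then written into that gap and vlh_used is grown accordingly.
 */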
static void lvar_rec_add(struct iam_leaf *leaf,
                         const struct iam_key *k, const struct iam_rec *r)
{
        const char *key;
        int ksize;
        int shift;
        void *end;
        void *start;
        ptrdiff_t diff;

        assert_corr(lvar_can_add(leaf, k, r));
        assert_inv(n_invariant(leaf));
        assert_corr(iam_leaf_is_locked(leaf));

        key   = kchar(k);
        ksize = strlen(key);
        shift = getsize(leaf, ksize, rec_size(r));

        if (!lvar_at_end(leaf)) {
                assert_corr(n_cur(leaf) < n_end(leaf));
                end = n_end(leaf);
                if (lvar_key_cmp(leaf, k) <= 0)
                        lvar_next(leaf);
                else
                        /*
                         * Another exceptional case: insertion with the key
                         * less than least key in the leaf.
                         */
                        assert_corr(leaf->il_at == leaf->il_entries);

                start = leaf->il_at;
                diff  = PDIFF(end, start);
                assert_corr(diff >= 0);
                memmove(start + shift, start, diff);
        }
        h_used_adj(leaf, n_head(leaf), shift);
        n_cur(leaf)->vle_keysize = cpu_to_le16(ksize);
        n_cur(leaf)->vle_hash = cpu_to_le32(get_hash(iam_leaf_container(leaf),
                                            key, ksize));
        __lvar_key_set(leaf, k);
        __lvar_rec_set(leaf, r);
        assert_corr(n_at_rec(leaf));
        assert_inv(n_invariant(leaf));
}

static void lvar_rec_del(struct iam_leaf *leaf, int shift)
{
        void *next;
        void *end;
        int nob;

        assert_corr(n_at_rec(leaf));
        assert_inv(n_invariant(leaf));
        assert_corr(iam_leaf_is_locked(leaf));

        end  = n_end(leaf);
        next = e_next(leaf, n_cur(leaf));
        nob  = e_size(leaf, n_cur(leaf));
        memmove(leaf->il_at, next, end - next);
        h_used_adj(leaf, n_head(leaf), -nob);
        assert_inv(n_invariant(leaf));
}

static void lvar_init_new(struct iam_container *c, struct buffer_head *bh)
{
        struct lvar_leaf_header *hdr;

        hdr = (struct lvar_leaf_header *)bh->b_data;
        hdr->vlh_magic = cpu_to_le16(IAM_LVAR_LEAF_MAGIC);
        hdr->vlh_used  = cpu_to_le16(sizeof(*hdr));
}

static struct lvar_leaf_entry *find_pivot(const struct iam_leaf *leaf,
                                          struct lvar_leaf_entry **prev)
{
        void *scan;
        void *start;
        int threshold;

        *prev = NULL;
        threshold = blocksize(leaf) / 2;
        for (scan = start = n_start(leaf); scan - start <= threshold;
                        *prev = scan, scan = e_next(leaf, scan)) {
                ;
        }
        return scan;
}

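/*
 * Descriptive note (added for clarity): lvar_split() moves the entries past
 * the pivot found by find_pivot() (roughly half a block) into the freshly
 * allocated leaf @new_blknr, adjusts vlh_used in both blocks, and inserts
 * the pivot hash together with the new block number into the parent index
 * node.  If the pivot hash equals the hash of the last entry left behind,
 * its least significant bit is set as a hash-collision marker.  When the
 * insertion point falls into the moved half, the leaf cursor is switched to
 * the new block and *bh is set to the old block, so the caller still gets
 * both buffers back.
 */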
static void lvar_split(struct iam_leaf *leaf, struct buffer_head **bh,
                       iam_ptr_t new_blknr)
{
        struct lvar_leaf_entry *first_to_move;
        struct lvar_leaf_entry *last_to_stay;
        struct iam_path *path;
        struct lvar_leaf_header *hdr;
        struct buffer_head *new_leaf;
        ptrdiff_t tomove;
        lvar_hash_t hash;

        assert_inv(n_invariant(leaf));
        assert_corr(iam_leaf_is_locked(leaf));

        new_leaf = *bh;
        path = iam_leaf_path(leaf);

        hdr = (void *)new_leaf->b_data;

        first_to_move = find_pivot(leaf, &last_to_stay);
        assert_corr(last_to_stay != NULL);
        assert_corr(e_next(leaf, last_to_stay) == first_to_move);

        hash = e_hash(first_to_move);
        if (hash == e_hash(last_to_stay))
                /*
                 * Duplicate hash.
                 */
                hash |= 1;

        tomove = PDIFF(n_end(leaf), first_to_move);
        memmove(hdr + 1, first_to_move, tomove);

        h_used_adj(leaf, hdr, tomove);
        h_used_adj(leaf, n_head(leaf), -tomove);

        assert_corr(n_end(leaf) == first_to_move);

        if (n_cur(leaf) >= first_to_move) {
                /*
                 * insertion point moves into new leaf.
                 */
                ptrdiff_t shift;
                int result;

                shift = PDIFF(leaf->il_at, first_to_move);
                *bh = leaf->il_bh;
                leaf->il_bh = new_leaf;
                leaf->il_curidx = new_blknr;

                assert_corr(iam_leaf_is_locked(leaf));
                result = lvar_init(leaf);
                /*
                 * init cannot fail, as node was just initialized.
                 */
                assert_corr(result == 0);
                leaf->il_at = ((void *)leaf->il_at) + shift;
        }
        /*
         * Insert pointer to the new node (together with the least key in
         * the node) into index node.
         */
        iam_insert_key_lock(path, path->ip_frame, (struct iam_ikey *)&hash,
                            new_blknr);
        assert_corr(n_cur(leaf) < n_end(leaf));
        assert_inv(n_invariant(leaf));
}

static int lvar_leaf_empty(struct iam_leaf *leaf)
{
        return h_used(n_head(leaf)) == sizeof(struct lvar_leaf_header);
}

static struct iam_leaf_operations lvar_leaf_ops = {
        .init           = lvar_init,
        .init_new       = lvar_init_new,
        .fini           = lvar_fini,
        .start          = lvar_start,
        .next           = lvar_next,
        .key            = lvar_key,
        .ikey           = lvar_ikey,
        .rec            = lvar_rec,
        .key_set        = lvar_key_set,
        .key_cmp        = lvar_key_cmp,
        .key_eq         = lvar_key_eq,
        .key_size       = lvar_key_size,
        .rec_set        = lvar_rec_set,
        .rec_eq         = lvar_rec_eq,
        .rec_get        = lvar_rec_get,
        .lookup         = lvar_lookup,
        .ilookup        = lvar_ilookup,
        .at_end         = lvar_at_end,
        .rec_add        = lvar_rec_add,
        .rec_del        = lvar_rec_del,
        .can_add        = lvar_can_add,
        .split          = lvar_split,
        .leaf_empty     = lvar_leaf_empty,
};

/*
 * Index operations.
 */

enum {
        /* This is duplicated in lustre/utils/create_iam.c */
        /* egrep -i '^o?x?[olabcdef]*$' /usr/share/dict/words */
        IAM_LVAR_ROOT_MAGIC = 0xb01dface
};

/* This is duplicated in lustre/utils/create_iam.c */
struct lvar_root {
        __le32 vr_magic;
        __le16 vr_recsize;
        __le16 vr_ptrsize;
        u8 vr_indirect_levels;
        u8 vr_padding0;
        __le16 vr_padding1;
};

static u32 lvar_root_ptr(struct iam_container *c)
{
        return 0;
}

static int lvar_node_init(struct iam_container *c, struct buffer_head *bh,
                          int root)
{
        return 0;
}

static struct iam_entry *lvar_root_inc(struct iam_container *c,
                                       struct iam_path *path,
                                       struct iam_frame *frame)
{
        struct lvar_root *root;
        struct iam_entry *entries;

        assert_corr(iam_frame_is_locked(path, frame));
        entries = frame->entries;

        dx_set_count(entries, 2);
        assert_corr(dx_get_limit(entries) == dx_root_limit(path));

        root = (void *)frame->bh->b_data;
        assert_corr(le32_to_cpu(root->vr_magic) == IAM_LVAR_ROOT_MAGIC);
        root->vr_indirect_levels++;
        frame->at = entries = iam_entry_shift(path, entries, 1);
        memset(iam_ikey_at(path, entries), 0,
               iam_path_descr(path)->id_ikey_size);
        return entries;
}

static int lvar_node_check(struct iam_path *path, struct iam_frame *frame)
{
        unsigned int count;
        unsigned int limit;
        unsigned int limit_correct;
        struct iam_entry *entries;

        entries = dx_node_get_entries(path, frame);

        if (frame == path->ip_frames) {
                struct lvar_root *root;

                root = (void *)frame->bh->b_data;
                if (le32_to_cpu(root->vr_magic) != IAM_LVAR_ROOT_MAGIC)
                        return -EIO;
                limit_correct = dx_root_limit(path);
        } else
                limit_correct = dx_node_limit(path);
        count = dx_get_count(entries);
        limit = dx_get_limit(entries);
        if (count > limit)
                return -EIO;
        if (limit != limit_correct)
                return -EIO;
        return 0;
}

static int lvar_node_load(struct iam_path *path, struct iam_frame *frame)
{
        struct iam_entry *entries;
        void *data;

        entries = dx_node_get_entries(path, frame);
        data = frame->bh->b_data;

        if (frame == path->ip_frames) {
                struct lvar_root *root;
                const char *name;

                root = data;
                name = kchar(path->ip_key_target);
                path->ip_indirect = root->vr_indirect_levels;
                if (path->ip_ikey_target == NULL) {
                        path->ip_ikey_target = iam_path_ikey(path, 4);
                        *(lvar_hash_t *)path->ip_ikey_target =
                                get_hash(path->ip_container, name,
                                         strlen(name));
                }
        }
        frame->entries = frame->at = entries;
        return 0;
}

static int lvar_ikeycmp(const struct iam_container *c,
                        const struct iam_ikey *k1, const struct iam_ikey *k2)
{
        lvar_hash_t p1 = le32_to_cpu(*(lvar_hash_t *)k1);
        lvar_hash_t p2 = le32_to_cpu(*(lvar_hash_t *)k2);

        return p1 > p2 ? 1 : (p1 < p2 ? -1 : 0);
}

static struct iam_path_descr *lvar_ipd_alloc(const struct iam_container *c,
                                             void *area)
{
        return iam_ipd_alloc(area, c->ic_descr->id_ikey_size);
}

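/*
 * Descriptive note (added for clarity): lvar_root() formats the root block
 * of a new container.  The resulting layout is: struct lvar_root, then a
 * struct dx_countlimit with count == 2 (the countlimit slot itself plus one
 * pointer), then a u32 "idle_blocks" slot, and, at offset
 * sizeof(struct lvar_root) + sizeof(lvar_hash_t) + ptrsize, the first real
 * index entry: a zero hash followed by a pointer to block 1, which holds
 * the only leaf of the minimal tree.
 */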
static void lvar_root(void *buf,
                      int blocksize, int keysize, int ptrsize, int recsize)
{
        struct lvar_root *root;
        struct dx_countlimit *limit;
        void *entry;
        int isize;

        isize = sizeof(lvar_hash_t) + ptrsize;
        root = buf;
        *root = (typeof(*root)) {
                .vr_magic            = cpu_to_le32(IAM_LVAR_ROOT_MAGIC),
                .vr_recsize          = cpu_to_le16(recsize),
                .vr_ptrsize          = cpu_to_le16(ptrsize),
                .vr_indirect_levels  = 0
        };

        limit = (void *)(root + 1);
        *limit = (typeof(*limit)){
                /*
                 * limit itself + one pointer to the leaf.
                 */
                .count = cpu_to_le16(2),
                .limit = cpu_to_le16(iam_root_limit(sizeof(struct lvar_root),
                                                    blocksize,
                                                    sizeof(lvar_hash_t) +
                                                    ptrsize))
        };

        /* The padding "keysize + ptrsize" must be large enough to cover the
         * "dx_countlimit" and the "idle_blocks" slots. */
        LASSERT((keysize + ptrsize) >=
                (sizeof(struct dx_countlimit) + sizeof(u32)));

        entry = (void *)(limit + 1);
        /* Put "idle_blocks" just after the limit. There was padding after
         * the limit in the old layout, and "idle_blocks" reuses part of that
         * padding, so there are no compatibility issues with the old layout.
         */
        *(u32 *)entry = 0;

        /*
         * Skip over @limit.
         */
        entry = (void *)(root + 1) + isize;

        /*
         * Entry format is <key> followed by <ptr>. In the minimal tree
         * consisting of a root and single node, <key> is a minimal possible
         * key.
         */
        *(lvar_hash_t *)entry = 0;
        entry += sizeof(lvar_hash_t);
        /* now @entry points to <ptr> */
        if (ptrsize == 4)
                *(u_int32_t *)entry = cpu_to_le32(1);
        else
                *(u_int64_t *)entry = cpu_to_le64(1);
}

static int lvar_esize(int namelen, int recsize)
{
        return (offsetof(struct lvar_leaf_entry, vle_key) +
                        namelen + recsize + LVAR_ROUND) & ~LVAR_ROUND;
}

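/*
 * Descriptive note (added for clarity): lvar_leaf() formats the first leaf
 * block of a new container with a header and a single bootstrap entry: the
 * empty key (hash 0, keysize 0) followed by a zeroed record whose first
 * byte stores the record size, matching what rec_size() expects.
 */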
static void lvar_leaf(void *buf,
                      int blocksize, int keysize, int ptrsize, int recsize)
{
        struct lvar_leaf_header *head;
        struct lvar_leaf_entry *entry;

        /* form leaf */
        head = buf;
        *head = (typeof(*head)) {
                .vlh_magic = cpu_to_le16(IAM_LVAR_LEAF_MAGIC),
                .vlh_used  = cpu_to_le16(sizeof(*head) + lvar_esize(0, recsize))
        };
        entry = (void *)(head + 1);
        *entry = (typeof(*entry)) {
                .vle_hash    = 0,
                .vle_keysize = 0
        };
        memset(e_rec(entry), 0, recsize);
        *(char *)e_rec(entry) = recsize;
}

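/*
 * Descriptive note (added for clarity): iam_lvar_create() builds an empty
 * lvar container on @obj inside the caller's transaction @handle: block 0
 * is appended and formatted as the root, block 1 as the bootstrap leaf, and
 * both buffers are marked as dirty metadata.  A negative errno is returned
 * on failure.
 */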
int iam_lvar_create(struct inode *obj,
                    int keysize, int ptrsize, int recsize, handle_t *handle)
{
        struct buffer_head *root_node;
        struct buffer_head *leaf_node;
        struct super_block *sb;

        u32 blknr;
        int result = 0;
        unsigned long bsize;

        assert_corr(obj->i_size == 0);

        sb = obj->i_sb;
        bsize = sb->s_blocksize;
        root_node = osd_ldiskfs_append(handle, obj, &blknr);
        if (IS_ERR(root_node))
                GOTO(out, result = PTR_ERR(root_node));

        leaf_node = osd_ldiskfs_append(handle, obj, &blknr);
        if (IS_ERR(leaf_node))
                GOTO(out_root, result = PTR_ERR(leaf_node));

        lvar_root(root_node->b_data, bsize, keysize, ptrsize, recsize);
        lvar_leaf(leaf_node->b_data, bsize, keysize, ptrsize, recsize);
        ldiskfs_mark_inode_dirty(handle, obj);
        result = ldiskfs_handle_dirty_metadata(handle, NULL, root_node);
        if (result == 0)
                result = ldiskfs_handle_dirty_metadata(handle, NULL, leaf_node);
        if (result != 0)
                ldiskfs_std_error(sb, result);

        brelse(leaf_node);

        GOTO(out_root, result);

out_root:
        brelse(root_node);
out:
        return result;
}

static struct iam_operations lvar_ops = {
        .id_root_ptr    = lvar_root_ptr,
        .id_node_read   = iam_node_read,
        .id_node_init   = lvar_node_init,
        .id_node_check  = lvar_node_check,
        .id_node_load   = lvar_node_load,
        .id_ikeycmp     = lvar_ikeycmp,
        .id_root_inc    = lvar_root_inc,
        .id_ipd_alloc   = lvar_ipd_alloc,
        .id_ipd_free    = iam_ipd_free,
        .id_name        = "lvar"
};

static int lvar_guess(struct iam_container *c)
{
        int result;
        struct buffer_head *bh;
        const struct lvar_root *root;

        assert_corr(c->ic_object != NULL);

        result = iam_node_read(c, lvar_root_ptr(c), NULL, &bh);
        if (result == 0) {
                root = (void *)bh->b_data;

                if (le32_to_cpu(root->vr_magic) == IAM_LVAR_ROOT_MAGIC) {
                        struct iam_descr *descr;

                        descr = c->ic_descr;
                        descr->id_key_size  = LDISKFS_NAME_LEN;
                        descr->id_ikey_size = sizeof(lvar_hash_t);
                        descr->id_rec_size  = le16_to_cpu(root->vr_recsize);
                        descr->id_ptr_size  = le16_to_cpu(root->vr_ptrsize);
                        descr->id_root_gap  = sizeof(*root);
                        descr->id_node_gap  = 0;
                        descr->id_ops       = &lvar_ops;
                        descr->id_leaf_ops  = &lvar_leaf_ops;
                        c->ic_root_bh = bh;
                } else {
                        result = -EBADF;
                        brelse(bh);
                }
        }
        return result;
}

static struct iam_format lvar_format = {
        .if_guess = lvar_guess
};

void iam_lvar_format_init(void)
{
        iam_format_register(&lvar_format);
}