/*
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.gnu.org/licenses/gpl-2.0.html
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2012, 2015, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 *
 * iam_lvar.c
 *
 * implementation of iam format for fixed size records, variable sized keys.
 *
 * Author: Nikita Danilov <nikita@clusterfs.com>
 */

#include <linux/types.h>
#include "osd_internal.h"

/*
 * Leaf operations.
 */

enum {
	/* This is duplicated in lustre/utils/create_iam.c */
	IAM_LVAR_LEAF_MAGIC = 0x1973
};

/* This is duplicated in lustre/utils/create_iam.c */
struct lvar_leaf_header {
	__le16 vlh_magic; /* magic number IAM_LVAR_LEAF_MAGIC */
	__le16 vlh_used;  /* used bytes, including header */
};

/*
 * Format of leaf entry:
 *
 * __le16 keysize
 *     u8 key[keysize]
 *     u8 record[rec_size]
 *
 * Entries are ordered in key order.
 */

/* This is duplicated in lustre/utils/create_iam.c */
typedef u32 lvar_hash_t;

/* This is duplicated in lustre/utils/create_iam.c */
struct lvar_leaf_entry {
	__le32 vle_hash;
	__le16 vle_keysize;
	u8 vle_key[0];
};

#define PDIFF(ptr0, ptr1) (((char *)(ptr0)) - ((char *)(ptr1)))


static inline int blocksize(const struct iam_leaf *leaf)
{
	return iam_leaf_container(leaf)->ic_object->i_sb->s_blocksize;
}

static inline const char *kchar(const struct iam_key *key)
{
	return (void *)key;
}

static inline struct iam_lentry *lvar_lentry(const struct lvar_leaf_entry *ent)
{
	return (struct iam_lentry *)ent;
}

static inline struct lvar_leaf_entry *lentry_lvar(const struct iam_lentry *lent)
{
	return (struct lvar_leaf_entry *)lent;
}


static inline int e_keysize(const struct lvar_leaf_entry *ent)
{
	return le16_to_cpu(ent->vle_keysize);
}

/* This is duplicated in lustre/utils/create_iam.c */
enum {
	LVAR_PAD   = 4,
	LVAR_ROUND = LVAR_PAD - 1
};

static inline int getsize(const struct iam_leaf *leaf, int namelen, int recsize)
{
	BUILD_BUG_ON((LVAR_PAD & (LVAR_PAD - 1)));

	return (offsetof(struct lvar_leaf_entry, vle_key) +
			namelen + recsize + LVAR_ROUND) & ~LVAR_ROUND;
}
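
/*
 * Entry size example: the fixed part of struct lvar_leaf_entry
 * (vle_hash + vle_keysize) is 6 bytes, and the total is rounded up to a
 * multiple of LVAR_PAD (4) bytes.  A 5-byte name with a 1-byte record thus
 * occupies 6 + 5 + 1 = 12 bytes, while a 6-byte name with the same record
 * takes 6 + 6 + 1 = 13, padded to 16 bytes on disk.
 */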

static inline int rec_size(const struct iam_rec *rec)
{
	return *(const char *)rec;
}

static inline struct iam_rec *e_rec(const struct lvar_leaf_entry *ent)
{
	return ((void *)ent) +
		offsetof(struct lvar_leaf_entry, vle_key) + e_keysize(ent);
}

static inline int e_size(const struct iam_leaf *leaf,
			 const struct lvar_leaf_entry *ent)
{
	return getsize(leaf, e_keysize(ent), rec_size(e_rec(ent)));
}

static inline char *e_char(const struct lvar_leaf_entry *ent)
{
	return (char *)&ent->vle_key;
}

static inline struct iam_key *e_key(const struct lvar_leaf_entry *ent)
{
	return (struct iam_key *)e_char(ent);
}

static inline lvar_hash_t e_hash(const struct lvar_leaf_entry *ent)
{
	return le32_to_cpu(ent->vle_hash);
}

static inline struct lvar_leaf_entry *e_next(const struct iam_leaf *leaf,
					     const struct lvar_leaf_entry *ent)
{
	return ((void *)ent) + e_size(leaf, ent);
}

#define LVAR_HASH_SANDWICH  (0)
#define LVAR_HASH_TEA       (1)
#define LVAR_HASH_R5        (0)
#define LVAR_HASH_PREFIX    (0)

#ifdef HAVE_LDISKFSFS_DIRHASH_WITH_DIR
#define e_ldiskfsfs_dirhash(dir, name, len, info) \
		ldiskfsfs_dirhash((dir), (name), (len), (info))
#else
#define e_ldiskfsfs_dirhash(dir, name, len, info) \
		ldiskfsfs_dirhash((name), (len), (info))
#endif
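
/*
 * Note: the wrapper above only hides an ldiskfs API difference; newer
 * ldiskfs/ext4 versions appear to take the directory inode as an extra
 * argument to ldiskfsfs_dirhash() (HAVE_LDISKFSFS_DIRHASH_WITH_DIR), while
 * older ones hash the name alone.  The computed hash is used identically
 * in both cases.
 */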

static u32 hash_build0(const struct inode *dir, const char *name, int namelen)
{
	u32 result;

	if (namelen == 0)
		return 0;
	if (strncmp(name, ".", 1) == 0 && namelen == 1)
		return 1;
	if (strncmp(name, "..", 2) == 0 && namelen == 2)
		return 2;

	if (LVAR_HASH_PREFIX) {
		result = 0;
		strncpy((void *)&result,
			name, min_t(int, namelen, sizeof(result)));
	} else {
		struct ldiskfs_dx_hash_info hinfo;

		hinfo.hash_version = LDISKFS_DX_HASH_TEA;
		hinfo.seed = NULL;
		e_ldiskfsfs_dirhash(dir, name, namelen, &hinfo);
		result = hinfo.hash;
		if (LVAR_HASH_SANDWICH) {
			u32 result2;

			hinfo.hash_version = LDISKFS_DX_HASH_TEA;
			hinfo.seed = NULL;
			e_ldiskfsfs_dirhash(dir, name, namelen, &hinfo);
			result2 = hinfo.hash;
			result = (0xfc000000 & result2) | (0x03ffffff & result);
		}
	}
	return result;
}

enum {
	HASH_GRAY_AREA = 1024,
	HASH_MAX_SIZE  = 0x7fffffffUL
};

static u32 hash_build(const struct inode *dir, const char *name, int namelen)
{
	u32 hash;

	hash = (hash_build0(dir, name, namelen) << 1) & HASH_MAX_SIZE;
	if (hash > HASH_MAX_SIZE - HASH_GRAY_AREA)
		hash &= HASH_GRAY_AREA - 1;
	return hash;
}
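
/*
 * Note on the hash layout: hash_build() shifts the raw hash left by one,
 * keeping the least significant bit clear so it can be used as a hash
 * collision marker (see the "|= 1" in lvar_split() and the "& ~1" in
 * n_invariant()), and masks the result to 31 bits.  Hashes that land
 * within HASH_GRAY_AREA of HASH_MAX_SIZE are wrapped down below
 * HASH_GRAY_AREA, presumably so that values derived from them (such as
 * delimiting keys with the collision bit set) cannot overflow the hash
 * space.
 */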

static inline lvar_hash_t get_hash(const struct inode *dir,
				   const char *name, int namelen)
{
	return hash_build(dir, name, namelen);
}

static inline lvar_hash_t iam_get_hash(const struct iam_leaf *leaf,
				       const char *name, int namelen)
{
	struct iam_path *iam_path = iam_leaf_path(leaf);

	return get_hash(iam_path_obj(iam_path), name, namelen);
}

static inline int e_eq(const struct lvar_leaf_entry *ent,
		       const char *name, int namelen)
{
	return namelen == e_keysize(ent) && !memcmp(e_char(ent), name, namelen);
}

static inline int e_cmp(const struct iam_leaf *leaf,
			const struct lvar_leaf_entry *ent, lvar_hash_t hash)
{
	lvar_hash_t ehash;

	ehash = e_hash(ent);
	return ehash == hash ? 0 : (ehash < hash ? -1 : 1);
}

static struct lvar_leaf_header *n_head(const struct iam_leaf *l)
{
	return (struct lvar_leaf_header *)l->il_bh->b_data;
}

static int h_used(const struct lvar_leaf_header *hdr)
{
	return le16_to_cpu(hdr->vlh_used);
}

static void h_used_adj(const struct iam_leaf *leaf,
		       struct lvar_leaf_header *hdr, int adj)
{
	int used;

	used = h_used(hdr) + adj;
	assert_corr(sizeof(*hdr) <= used && used <= blocksize(leaf));
	hdr->vlh_used = cpu_to_le16(used);
}

static struct lvar_leaf_entry *n_start(const struct iam_leaf *leaf)
{
	return (void *)leaf->il_bh->b_data + sizeof(struct lvar_leaf_header);
}

static struct lvar_leaf_entry *n_end(const struct iam_leaf *l)
{
	return (void *)l->il_bh->b_data + h_used(n_head(l));
}

static struct lvar_leaf_entry *n_cur(const struct iam_leaf *l)
{
	return lentry_lvar(l->il_at);
}

#if LDISKFS_CORRECTNESS_ON
static int n_at_rec(const struct iam_leaf *folio)
{
	return n_start(folio) <= lentry_lvar(folio->il_at) &&
		lentry_lvar(folio->il_at) < n_end(folio);
}

#if LDISKFS_INVARIANT_ON
static int n_invariant(const struct iam_leaf *leaf)
{
	struct iam_path *path;
	struct inode *dir;
	struct lvar_leaf_entry *scan;
	struct lvar_leaf_entry *end;
	lvar_hash_t hash;
	lvar_hash_t nexthash;
	lvar_hash_t starthash;

	end  = n_end(leaf);
	hash = 0;
	path = leaf->il_path;

	if (h_used(n_head(leaf)) > blocksize(leaf))
		return 0;

	dir = iam_path_obj(path);
	/*
	 * Delimiting key in the parent index node. Clear least bit to account
	 * for hash collision marker.
	 */
	starthash = *(lvar_hash_t *)iam_ikey_at(path, path->ip_frame->at) & ~1;
	for (scan = n_start(leaf); scan < end; scan = e_next(leaf, scan)) {
		nexthash = e_hash(scan);
		if (nexthash != get_hash(dir, e_char(scan), e_keysize(scan))) {
			BREAKPOINT();
			return 0;
		}
		if (nexthash < hash) {
			BREAKPOINT();
			return 0;
		}
		hash = nexthash;
	}
	if (scan != end) {
		BREAKPOINT();
		return 0;
	}
	return 1;
}
/* LDISKFS_INVARIANT_ON */
#endif

/* LDISKFS_CORRECTNESS_ON */
#endif

static struct iam_ikey *lvar_ikey(const struct iam_leaf *l,
				  struct iam_ikey *key)
{
	lvar_hash_t *hash;

	assert_corr(n_at_rec(l));

	hash = (void *)key;
	*hash = e_hash(n_cur(l));
	return key;
}

static struct iam_key *lvar_key(const struct iam_leaf *l)
{
	return e_key(n_cur(l));
}

static int lvar_key_size(const struct iam_leaf *l)
{
	return e_keysize(n_cur(l));
}

static void lvar_start(struct iam_leaf *l)
{
	l->il_at = lvar_lentry(n_start(l));
}

static int lvar_init(struct iam_leaf *l)
{
	int result;
	int used;
	struct lvar_leaf_header *head;

	assert_corr(l->il_bh != NULL);

	head = n_head(l);
	used = h_used(head);
	if (le16_to_cpu(head->vlh_magic) == IAM_LVAR_LEAF_MAGIC &&
			used <= blocksize(l)) {
		l->il_at = l->il_entries = lvar_lentry(n_start(l));
		result = 0;
	} else {
		struct inode *obj;

		obj = iam_leaf_container(l)->ic_object;
		CERROR(
		"Bad magic in node %llu (#%lu): %#x != %#x or wrong used: %d\n",
		(unsigned long long)l->il_bh->b_blocknr, obj->i_ino,
		le16_to_cpu(head->vlh_magic), IAM_LVAR_LEAF_MAGIC,
		used);
		result = -EIO;
	}
	return result;
}

static void lvar_fini(struct iam_leaf *l)
{
	l->il_entries = l->il_at = NULL;
}

static struct iam_rec *lvar_rec(const struct iam_leaf *l)
{
	assert_corr(n_at_rec(l));
	return e_rec(n_cur(l));
}

static void lvar_next(struct iam_leaf *l)
{
	assert_corr(n_at_rec(l));
	assert_corr(iam_leaf_is_locked(l));
	l->il_at = lvar_lentry(e_next(l, n_cur(l)));
}

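/*
 * Position the leaf cursor for the name in @k.  Returns IAM_LOOKUP_EXACT
 * when an entry with the same name is found, IAM_LOOKUP_BEFORE when the
 * name hashes below every entry in the leaf (cursor left at the first
 * entry), and IAM_LOOKUP_OK otherwise, with the cursor at the first entry
 * whose hash equals the target (a hash collision with a different name)
 * or, failing that, at the last entry with a smaller hash.
 * IAM_LOOKUP_LAST is or-ed in when no entry with a greater hash exists in
 * this leaf.
 */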
static int lvar_lookup(struct iam_leaf *leaf, const struct iam_key *k)
{
	struct lvar_leaf_entry *found;
	struct lvar_leaf_entry *scan;
	struct lvar_leaf_entry *end;
	int result;
	const char *name;
	int namelen;
	int found_equal;
	lvar_hash_t hash;
	int last;

	assert_inv(n_invariant(leaf));
	end = n_end(leaf);

	name = kchar(k);
	namelen = strlen(name);
	hash = iam_get_hash(leaf, name, namelen);
	found = NULL;
	found_equal = 0;
	last = 1;

	for (scan = n_start(leaf); scan < end; scan = e_next(leaf, scan)) {
		lvar_hash_t scan_hash;

		scan_hash = e_hash(scan);
		if (scan_hash < hash)
			found = scan;
		else if (scan_hash == hash) {
			if (e_eq(scan, name, namelen)) {
				/*
				 * perfect match
				 */
				leaf->il_at = lvar_lentry(scan);
				return IAM_LOOKUP_EXACT;
			} else if (!found_equal) {
				found = scan;
				found_equal = 1;
			}
		} else {
			last = 0;
			break;
		}
	}
	if (found == NULL) {
		/*
		 * @k is less than all hashes in the leaf.
		 */
		lvar_start(leaf);
		result = IAM_LOOKUP_BEFORE;
	} else {
		leaf->il_at = lvar_lentry(found);
		result = IAM_LOOKUP_OK;
		assert_corr(n_at_rec(leaf));
	}
	if (last)
		result |= IAM_LOOKUP_LAST;
	assert_inv(n_invariant(leaf));

	return result;
}

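/*
 * Like lvar_lookup(), but positions the cursor by the 32-bit hash carried
 * in the index key @ik instead of by name, so entries can only be
 * distinguished up to hash collisions.
 */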
static int lvar_ilookup(struct iam_leaf *leaf, const struct iam_ikey *ik)
{
	struct lvar_leaf_entry *scan;
	struct lvar_leaf_entry *end;
	lvar_hash_t hash;

	assert_inv(n_invariant(leaf));
	end  = n_end(leaf);
	hash = *(const lvar_hash_t *)ik;

	lvar_start(leaf);
	for (scan = n_start(leaf); scan < end; scan = e_next(leaf, scan)) {
		lvar_hash_t scan_hash;

		scan_hash = e_hash(scan);
		if (scan_hash > hash)
			return scan == n_start(leaf) ?
				IAM_LOOKUP_BEFORE : IAM_LOOKUP_OK;
		leaf->il_at = lvar_lentry(scan);
		if (scan_hash == hash)
			return IAM_LOOKUP_EXACT;
	}
	assert_inv(n_invariant(leaf));
	/*
	 * @ik is greater than any key in the node. Return last record in the
	 * node.
	 */
	return IAM_LOOKUP_OK;
}

static void __lvar_key_set(struct iam_leaf *l, const struct iam_key *k)
{
	memcpy(e_key(n_cur(l)), k, e_keysize(n_cur(l)));
}

static void lvar_key_set(struct iam_leaf *l, const struct iam_key *k)
{
	assert_corr(n_at_rec(l));
	assert_corr(strlen(kchar(k)) == e_keysize(n_cur(l)));
	assert_corr(iam_leaf_is_locked(l));
	__lvar_key_set(l, k);
	assert_inv(n_invariant(l));
}

static int lvar_key_cmp(const struct iam_leaf *l, const struct iam_key *k)
{
	lvar_hash_t hash;
	const char *name;

	name = kchar(k);

	hash = iam_get_hash(l, name, strlen(name));
	return e_cmp(l, n_cur(l), hash);
}

static int lvar_key_eq(const struct iam_leaf *l, const struct iam_key *k)
{
	const char *name;

	name = kchar(k);
	return e_eq(n_cur(l), name, strlen(name));
}

static void __lvar_rec_set(struct iam_leaf *l, const struct iam_rec *r)
{
	memcpy(e_rec(n_cur(l)), r, rec_size(r));
}

static void lvar_rec_set(struct iam_leaf *l, const struct iam_rec *r)
{
	assert_corr(n_at_rec(l));
	assert_corr(iam_leaf_is_locked(l));
	__lvar_rec_set(l, r);
	assert_inv(n_invariant(l));
}

static int lvar_rec_eq(const struct iam_leaf *l, const struct iam_rec *r)
{
	struct iam_rec *rec = e_rec(n_cur(l));

	if (rec_size(rec) != rec_size(r))
		return 0;
	return !memcmp(rec, r, rec_size(r));
}

static void lvar_rec_get(const struct iam_leaf *l, struct iam_rec *r)
{
	struct iam_rec *rec;

	rec = e_rec(n_cur(l));
	assert_corr(n_at_rec(l));
	assert_corr(iam_leaf_is_locked(l));
	memcpy(r, rec, rec_size(rec));
	assert_inv(n_invariant(l));
}

static int lvar_can_add(const struct iam_leaf *l,
			const struct iam_key *k, const struct iam_rec *r)
{
	assert_corr(iam_leaf_is_locked(l));
	return h_used(n_head(l)) +
		getsize(l, strlen(kchar(k)), rec_size(r)) <= blocksize(l);
}

static int lvar_at_end(const struct iam_leaf *folio)
{
	assert_corr(iam_leaf_is_locked(folio));
	return n_cur(folio) == n_end(folio);
}

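/*
 * Insert the (@k, @r) pair at the current cursor position: everything from
 * the insertion point to the end of the used area is shifted up by the
 * (padded) size of the new entry, and the hash, key length, key and record
 * are then written into the gap.  The caller must have checked
 * lvar_can_add() and must hold the leaf lock.
 */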
static void lvar_rec_add(struct iam_leaf *leaf,
			 const struct iam_key *k, const struct iam_rec *r)
{
	const char *key;
	int ksize;
	int shift;
	void *end;
	void *start;
	ptrdiff_t diff;

	assert_corr(lvar_can_add(leaf, k, r));
	assert_inv(n_invariant(leaf));
	assert_corr(iam_leaf_is_locked(leaf));

	key   = kchar(k);
	ksize = strlen(key);
	shift = getsize(leaf, ksize, rec_size(r));

	if (!lvar_at_end(leaf)) {
		assert_corr(n_cur(leaf) < n_end(leaf));
		end = n_end(leaf);
		if (lvar_key_cmp(leaf, k) <= 0)
			lvar_next(leaf);
		else
			/*
			 * Another exceptional case: insertion with the key
			 * less than least key in the leaf.
			 */
			assert_corr(leaf->il_at == leaf->il_entries);

		start = leaf->il_at;
		diff  = PDIFF(end, start);
		assert_corr(diff >= 0);
		memmove(start + shift, start, diff);
	}
	h_used_adj(leaf, n_head(leaf), shift);
	n_cur(leaf)->vle_keysize = cpu_to_le16(ksize);
	n_cur(leaf)->vle_hash = cpu_to_le32(iam_get_hash(leaf, key, ksize));
	__lvar_key_set(leaf, k);
	__lvar_rec_set(leaf, r);
	assert_corr(n_at_rec(leaf));
	assert_inv(n_invariant(leaf));
}

static void lvar_rec_del(struct iam_leaf *leaf, int shift)
{
	void *next;
	void *end;
	int nob;

	assert_corr(n_at_rec(leaf));
	assert_inv(n_invariant(leaf));
	assert_corr(iam_leaf_is_locked(leaf));

	end  = n_end(leaf);
	next = e_next(leaf, n_cur(leaf));
	nob  = e_size(leaf, n_cur(leaf));
	memmove(leaf->il_at, next, end - next);
	h_used_adj(leaf, n_head(leaf), -nob);
	assert_inv(n_invariant(leaf));
}

static void lvar_init_new(struct iam_container *c, struct buffer_head *bh)
{
	struct lvar_leaf_header *hdr;

	hdr = (struct lvar_leaf_header *)bh->b_data;
	hdr->vlh_magic = cpu_to_le16(IAM_LVAR_LEAF_MAGIC);
	hdr->vlh_used  = cpu_to_le16(sizeof(*hdr));
}

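/*
 * Find the split point of a leaf: returns the first entry that starts
 * beyond the half-block threshold (i.e. the first entry to move into the
 * new leaf) and stores the last entry that stays behind in *prev.
 */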
static struct lvar_leaf_entry *find_pivot(const struct iam_leaf *leaf,
					  struct lvar_leaf_entry **prev)
{
	void *scan;
	void *start;
	int threshold;

	*prev = NULL;
	threshold = blocksize(leaf) / 2;
	for (scan = start = n_start(leaf); scan - start <= threshold;
			*prev = scan, scan = e_next(leaf, scan)) {
		;
	}
	return scan;
}

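/*
 * Split @leaf into itself and the freshly allocated block @new_blknr whose
 * buffer is passed in via @bh: entries from the pivot onwards are moved to
 * the new leaf, the delimiting hash gets its low "collision" bit set when
 * the pivot shares a hash with the last entry that stays, the cursor
 * follows the moved entries when necessary (in which case the old and new
 * buffers are swapped through @bh), and the (hash, block) pair is finally
 * inserted into the parent index node.
 */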
static void lvar_split(struct iam_leaf *leaf, struct buffer_head **bh,
		       iam_ptr_t new_blknr)
{
	struct lvar_leaf_entry *first_to_move;
	struct lvar_leaf_entry *last_to_stay;
	struct iam_path *path;
	struct lvar_leaf_header *hdr;
	struct buffer_head *new_leaf;
	ptrdiff_t tomove;
	lvar_hash_t hash;

	assert_inv(n_invariant(leaf));
	assert_corr(iam_leaf_is_locked(leaf));

	new_leaf = *bh;
	path = iam_leaf_path(leaf);

	hdr = (void *)new_leaf->b_data;

	first_to_move = find_pivot(leaf, &last_to_stay);
	assert_corr(last_to_stay != NULL);
	assert_corr(e_next(leaf, last_to_stay) == first_to_move);

	hash = e_hash(first_to_move);
	if (hash == e_hash(last_to_stay))
		/*
		 * Duplicate hash.
		 */
		hash |= 1;

	tomove = PDIFF(n_end(leaf), first_to_move);
	memmove(hdr + 1, first_to_move, tomove);

	h_used_adj(leaf, hdr, tomove);
	h_used_adj(leaf, n_head(leaf), -tomove);

	assert_corr(n_end(leaf) == first_to_move);

	if (n_cur(leaf) >= first_to_move) {
		/*
		 * insertion point moves into new leaf.
		 */
		ptrdiff_t shift;
		int result;

		shift = PDIFF(leaf->il_at, first_to_move);
		*bh = leaf->il_bh;
		leaf->il_bh = new_leaf;
		leaf->il_curidx = new_blknr;

		assert_corr(iam_leaf_is_locked(leaf));
		result = lvar_init(leaf);
		/*
		 * init cannot fail, as node was just initialized.
		 */
		assert_corr(result == 0);
		leaf->il_at = ((void *)leaf->il_at) + shift;
	}
	/*
	 * Insert pointer to the new node (together with the least key in
	 * the node) into index node.
	 */
	iam_insert_key_lock(path, path->ip_frame, (struct iam_ikey *)&hash,
			    new_blknr);
	assert_corr(n_cur(leaf) < n_end(leaf));
	assert_inv(n_invariant(leaf));
}

static int lvar_leaf_empty(struct iam_leaf *leaf)
{
	return h_used(n_head(leaf)) == sizeof(struct lvar_leaf_header);
}

static const struct iam_leaf_operations lvar_leaf_ops = {
	.init           = lvar_init,
	.init_new       = lvar_init_new,
	.fini           = lvar_fini,
	.start          = lvar_start,
	.next           = lvar_next,
	.key            = lvar_key,
	.ikey           = lvar_ikey,
	.rec            = lvar_rec,
	.key_set        = lvar_key_set,
	.key_cmp        = lvar_key_cmp,
	.key_eq         = lvar_key_eq,
	.key_size       = lvar_key_size,
	.rec_set        = lvar_rec_set,
	.rec_eq         = lvar_rec_eq,
	.rec_get        = lvar_rec_get,
	.lookup         = lvar_lookup,
	.ilookup        = lvar_ilookup,
	.at_end         = lvar_at_end,
	.rec_add        = lvar_rec_add,
	.rec_del        = lvar_rec_del,
	.can_add        = lvar_can_add,
	.split          = lvar_split,
	.leaf_empty     = lvar_leaf_empty,
};

/*
 * Index operations.
 */

enum {
	/* This is duplicated in lustre/utils/create_iam.c */
	/* egrep -i '^o?x?[olabcdef]*$' /usr/share/dict/words */
	IAM_LVAR_ROOT_MAGIC = 0xb01dface
};

/* This is duplicated in lustre/utils/create_iam.c */
struct lvar_root {
	__le32 vr_magic;
	__le16 vr_recsize;
	__le16 vr_ptrsize;
	u8 vr_indirect_levels;
	u8 vr_padding0;
	__le16 vr_padding1;
};

static u32 lvar_root_ptr(struct iam_container *c)
{
	return 0;
}

static int lvar_node_init(struct iam_container *c, struct buffer_head *bh,
			  int root)
{
	return 0;
}

static struct iam_entry *lvar_root_inc(struct iam_container *c,
				       struct iam_path *path,
				       struct iam_frame *frame)
{
	struct lvar_root *root;
	struct iam_entry *entries;

	assert_corr(iam_frame_is_locked(path, frame));
	entries = frame->entries;

	dx_set_count(entries, 2);
	assert_corr(dx_get_limit(entries) == dx_root_limit(path));

	root = (void *)frame->bh->b_data;
	assert_corr(le32_to_cpu(root->vr_magic) == IAM_LVAR_ROOT_MAGIC);
	root->vr_indirect_levels++;
	frame->at = entries = iam_entry_shift(path, entries, 1);
	memset(iam_ikey_at(path, entries), 0,
	       iam_path_descr(path)->id_ikey_size);
	return entries;
}

static int lvar_node_check(struct iam_path *path, struct iam_frame *frame)
{
	unsigned int count;
	unsigned int limit;
	unsigned int limit_correct;
	struct iam_entry *entries;

	entries = dx_node_get_entries(path, frame);

	if (frame == path->ip_frames) {
		struct lvar_root *root;

		root = (void *)frame->bh->b_data;
		if (le32_to_cpu(root->vr_magic) != IAM_LVAR_ROOT_MAGIC)
			return -EIO;
		limit_correct = dx_root_limit(path);
	} else
		limit_correct = dx_node_limit(path);
	count = dx_get_count(entries);
	limit = dx_get_limit(entries);
	if (count > limit)
		return -EIO;
	if (limit != limit_correct)
		return -EIO;
	return 0;
}

static int lvar_node_load(struct iam_path *path, struct iam_frame *frame)
{
	struct iam_entry *entries;
	void *data;

	entries = dx_node_get_entries(path, frame);
	data = frame->bh->b_data;

	if (frame == path->ip_frames) {
		struct lvar_root *root;
		const char *name;

		root = data;
		name = kchar(path->ip_key_target);
		path->ip_indirect = root->vr_indirect_levels;
		if (path->ip_ikey_target == NULL) {
			path->ip_ikey_target = iam_path_ikey(path, 4);
			*(lvar_hash_t *)path->ip_ikey_target =
				get_hash(iam_path_obj(path), name,
					 strlen(name));
		}
	}
	frame->entries = frame->at = entries;
	return 0;
}

static int lvar_ikeycmp(const struct iam_container *c,
			const struct iam_ikey *k1, const struct iam_ikey *k2)
{
	lvar_hash_t p1 = le32_to_cpu(*(lvar_hash_t *)k1);
	lvar_hash_t p2 = le32_to_cpu(*(lvar_hash_t *)k2);

	return p1 > p2 ? 1 : (p1 < p2 ? -1 : 0);
}

static struct iam_path_descr *lvar_ipd_alloc(const struct iam_container *c,
					     void *area)
{
	return iam_ipd_alloc(area, c->ic_descr->id_ikey_size);
}

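/*
 * Lay out an empty root block.  It holds a struct lvar_root header, a
 * struct dx_countlimit with count == 2 (the countlimit slot itself plus
 * one pointer to the leaf), a u32 "idle blocks" slot carved out of what
 * used to be padding, and finally the first index entry: a zero hash key
 * followed by a pointer to block 1, where lvar_leaf() formats the initial
 * leaf.
 */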
static void lvar_root(void *buf,
		      int blocksize, int keysize, int ptrsize, int recsize)
{
	struct lvar_root *root;
	struct dx_countlimit *limit;
	void *entry;
	int isize;

	isize = sizeof(lvar_hash_t) + ptrsize;
	root = buf;
	*root = (typeof(*root)) {
		.vr_magic            = cpu_to_le32(IAM_LVAR_ROOT_MAGIC),
		.vr_recsize          = cpu_to_le16(recsize),
		.vr_ptrsize          = cpu_to_le16(ptrsize),
		.vr_indirect_levels  = 0
	};

	limit = (void *)(root + 1);
	*limit = (typeof(*limit)){
		/*
		 * limit itself + one pointer to the leaf.
		 */
		.count = cpu_to_le16(2),
		.limit = cpu_to_le16(iam_root_limit(sizeof(struct lvar_root),
					blocksize,
					sizeof(lvar_hash_t) + ptrsize))
	};

	/* To guarantee that the padding "keysize + ptrsize"
	 * covers the "dx_countlimit" and the "idle_blocks". */
	LASSERT((keysize + ptrsize) >=
		(sizeof(struct dx_countlimit) + sizeof(u32)));

	entry = (void *)(limit + 1);
	/* Put "idle_blocks" just after the limit. There was padding after
	 * the limit, the "idle_blocks" re-uses part of the padding, so no
	 * compatibility issues with old layout.
	 */
	*(u32 *)entry = 0;

	/*
	 * Skip over @limit.
	 */
	entry = (void *)(root + 1) + isize;

	/*
	 * Entry format is <key> followed by <ptr>. In the minimal tree
	 * consisting of a root and single node, <key> is a minimal possible
	 * key.
	 */
	*(lvar_hash_t *)entry = 0;
	entry += sizeof(lvar_hash_t);
	/* now @entry points to <ptr> */
	if (ptrsize == 4)
		*(u_int32_t *)entry = cpu_to_le32(1);
	else
		*(u_int64_t *)entry = cpu_to_le64(1);
}

static int lvar_esize(int namelen, int recsize)
{
	return (offsetof(struct lvar_leaf_entry, vle_key) +
			namelen + recsize + LVAR_ROUND) & ~LVAR_ROUND;
}

static void lvar_leaf(void *buf,
		      int blocksize, int keysize, int ptrsize, int recsize)
{
	struct lvar_leaf_header *head;
	struct lvar_leaf_entry *entry;

	/* form leaf */
	head = buf;
	*head = (typeof(*head)) {
		.vlh_magic = cpu_to_le16(IAM_LVAR_LEAF_MAGIC),
		.vlh_used  = cpu_to_le16(sizeof(*head) + lvar_esize(0, recsize))
	};
	entry = (void *)(head + 1);
	*entry = (typeof(*entry)) {
		.vle_hash    = 0,
		.vle_keysize = 0
	};
	memset(e_rec(entry), 0, recsize);
	*(char *)e_rec(entry) = recsize;
}

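/*
 * Create an empty lvar container in @obj under the caller's transaction
 * handle: block 0 is formatted as the root index node and block 1 as the
 * first (empty) leaf, and both blocks are marked dirty.
 */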
int iam_lvar_create(struct inode *obj,
		    int keysize, int ptrsize, int recsize, handle_t *handle)
{
	struct buffer_head *root_node;
	struct buffer_head *leaf_node;
	struct super_block *sb;

	u32 blknr;
	int result = 0;
	unsigned long bsize;

	assert_corr(obj->i_size == 0);

	sb = obj->i_sb;
	bsize = sb->s_blocksize;
	root_node = osd_ldiskfs_append(handle, obj, &blknr);
	if (IS_ERR(root_node))
		GOTO(out, result = PTR_ERR(root_node));

	leaf_node = osd_ldiskfs_append(handle, obj, &blknr);
	if (IS_ERR(leaf_node))
		GOTO(out_root, result = PTR_ERR(leaf_node));

	lvar_root(root_node->b_data, bsize, keysize, ptrsize, recsize);
	lvar_leaf(leaf_node->b_data, bsize, keysize, ptrsize, recsize);
	ldiskfs_mark_inode_dirty(handle, obj);
	result = ldiskfs_handle_dirty_metadata(handle, NULL, root_node);
	if (result == 0)
		result = ldiskfs_handle_dirty_metadata(handle, NULL, leaf_node);
	if (result != 0)
		ldiskfs_std_error(sb, result);

	brelse(leaf_node);

	GOTO(out_root, result);

out_root:
	brelse(root_node);
out:
	return result;
}

static const struct iam_operations lvar_ops = {
	.id_root_ptr    = lvar_root_ptr,
	.id_node_read   = iam_node_read,
	.id_node_init   = lvar_node_init,
	.id_node_check  = lvar_node_check,
	.id_node_load   = lvar_node_load,
	.id_ikeycmp     = lvar_ikeycmp,
	.id_root_inc    = lvar_root_inc,
	.id_ipd_alloc   = lvar_ipd_alloc,
	.id_ipd_free    = iam_ipd_free,
	.id_name        = "lvar"
};

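/*
 * Probe @c for the lvar format: read the root block and, when the magic
 * matches, fill in the container descriptor (record and pointer sizes come
 * from the root block, keys are names of up to LDISKFS_NAME_LEN bytes
 * hashed into a 32-bit ikey) and keep the root buffer referenced in
 * ic_root_bh.
 */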
int iam_lvar_guess(struct iam_container *c)
{
	int result;
	struct buffer_head *bh;
	const struct lvar_root *root;

	assert_corr(c->ic_object != NULL);

	result = iam_node_read(c, lvar_root_ptr(c), NULL, &bh);
	if (result == 0) {
		root = (void *)bh->b_data;

		if (le32_to_cpu(root->vr_magic) == IAM_LVAR_ROOT_MAGIC) {
			struct iam_descr *descr;

			descr = c->ic_descr;
			descr->id_key_size  = LDISKFS_NAME_LEN;
			descr->id_ikey_size = sizeof(lvar_hash_t);
			descr->id_rec_size  = le16_to_cpu(root->vr_recsize);
			descr->id_ptr_size  = le16_to_cpu(root->vr_ptrsize);
			descr->id_root_gap  = sizeof(*root);
			descr->id_node_gap  = 0;
			descr->id_ops       = &lvar_ops;
			descr->id_leaf_ops  = &lvar_leaf_ops;
			c->ic_root_bh = bh;
		} else {
			result = -EBADF;
			brelse(bh);
		}
	}
	return result;
}