lustre/osd-ldiskfs/osd_iam_lvar.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.gnu.org/licenses/gpl-2.0.html
19  *
20  * GPL HEADER END
21  */
22 /*
23  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Use is subject to license terms.
25  *
26  * Copyright (c) 2012, 2015, Intel Corporation.
27  */
28 /*
29  * This file is part of Lustre, http://www.lustre.org/
30  * Lustre is a trademark of Sun Microsystems, Inc.
31  *
32  * iam_lvar.c
33  *
34  * implementation of iam format for fixed size records, variable sized keys.
35  *
36  * Author: Nikita Danilov <nikita@clusterfs.com>
37  */
38
39 #include <linux/types.h>
40 #include "osd_internal.h"
41
42 /*
43  * Leaf operations.
44  */
45
46 enum {
47         /* This is duplicated in lustre/utils/create_iam.c */
48         IAM_LVAR_LEAF_MAGIC = 0x1973
49 };
50
51 /* This is duplicated in lustre/utils/create_iam.c */
52 struct lvar_leaf_header {
53         __le16 vlh_magic; /* magic number IAM_LVAR_LEAF_MAGIC */
54         __le16 vlh_used;  /* used bytes, including header */
55 };
56
57 /*
58  * Format of leaf entry:
59  *
60  * __le32 hash
61  * __le16 keysize
62  *     u8 key[keysize]
63  *     u8 record[rec_size]
64  * Entries are ordered by increasing hash of the key; colliding hashes are adjacent.
65  */
66
67 /* This is duplicated in lustre/utils/create_iam.c */
68 typedef u32 lvar_hash_t;
69
70 /* This is duplicated in lustre/utils/create_iam.c */
71 struct lvar_leaf_entry {
72         __le32 vle_hash;
73         __le16 vle_keysize;
74         u8 vle_key[0];
75 };
76
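
To make the layout above concrete, here is a standalone user-space sketch (not part of the module; it assumes a little-endian host and uses plain stdint types in place of the __le* annotations) that packs one entry, header, key bytes and record bytes, padded to the 4-byte LVAR_PAD boundary defined further below:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Illustrative stand-in for the on-disk entry header described above. */
struct demo_entry_hdr {
        uint32_t hash;          /* vle_hash (little-endian on disk) */
        uint16_t keysize;       /* vle_keysize (little-endian on disk) */
} __attribute__((packed));

/* Pack one entry (header, key bytes, record bytes); returns padded size. */
static size_t demo_pack_entry(unsigned char *dst, uint32_t hash,
                              const char *key, const unsigned char *rec,
                              size_t recsize)
{
        struct demo_entry_hdr hdr = {
                .hash = hash,
                .keysize = (uint16_t)strlen(key),
        };
        size_t used = 0;

        memcpy(dst + used, &hdr, sizeof(hdr));
        used += sizeof(hdr);
        memcpy(dst + used, key, hdr.keysize);
        used += hdr.keysize;
        memcpy(dst + used, rec, recsize);
        used += recsize;
        /* entries are padded to a 4-byte boundary (LVAR_PAD, defined below) */
        return (used + 3) & ~(size_t)3;
}

int main(void)
{
        unsigned char block[64];
        unsigned char rec[8] = { 8 };   /* first byte of a record is its size */

        /* 6-byte header + 3-byte key "foo" + 8-byte record = 17, padded to 20 */
        printf("%zu\n", demo_pack_entry(block, 0x1234u, "foo", rec, sizeof(rec)));
        return 0;
}
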
77 #define PDIFF(ptr0, ptr1) (((char *)(ptr0)) - ((char *)(ptr1)))
78
79
80 static inline int blocksize(const struct iam_leaf *leaf)
81 {
82         return iam_leaf_container(leaf)->ic_object->i_sb->s_blocksize;
83 }
84
85 static inline const char *kchar(const struct iam_key *key)
86 {
87         return (void *)key;
88 }
89
90 static inline struct iam_lentry *lvar_lentry(const struct lvar_leaf_entry *ent)
91 {
92         return (struct iam_lentry *)ent;
93 }
94
95 static inline struct lvar_leaf_entry *lentry_lvar(const struct iam_lentry *lent)
96 {
97         return (struct lvar_leaf_entry *)lent;
98 }
99
100
101 static inline int e_keysize(const struct lvar_leaf_entry *ent)
102 {
103         return le16_to_cpu(ent->vle_keysize);
104 }
105
106 /* This is duplicated in lustre/utils/create_iam.c */
107 enum {
108         LVAR_PAD   = 4,
109         LVAR_ROUND = LVAR_PAD - 1
110 };
111
112 static inline int getsize(const struct iam_leaf *leaf, int namelen, int recsize)
113 {
114         BUILD_BUG_ON((LVAR_PAD & (LVAR_PAD - 1)));
115
116         return (offsetof(struct lvar_leaf_entry, vle_key) +
117                         namelen + recsize + LVAR_ROUND) & ~LVAR_ROUND;
118 }
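
As a quick check of the rounding: with the 6-byte entry header (offsetof(struct lvar_leaf_entry, vle_key)), a 3-byte name plus an 8-byte record occupies 17 bytes, which getsize() rounds up to 20. A compile-time restatement of that arithmetic, purely illustrative:

#include <assert.h>

/* 6-byte entry header + 3-byte name + 8-byte record, rounded to LVAR_PAD (4) */
static_assert(((6 + 3 + 8 + 3) & ~3) == 20, "17 bytes of entry round up to 20");
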
119
120 static inline int rec_size(const struct iam_rec *rec)
121 {
122         return *(const char *)rec;
123 }
124
125 static inline struct iam_rec *e_rec(const struct lvar_leaf_entry *ent)
126 {
127         return ((void *)ent) +
128                 offsetof(struct lvar_leaf_entry, vle_key) + e_keysize(ent);
129 }
130
131 static inline int e_size(const struct iam_leaf *leaf,
132                          const struct lvar_leaf_entry *ent)
133 {
134         return getsize(leaf, e_keysize(ent), rec_size(e_rec(ent)));
135 }
136
137 static inline char *e_char(const struct lvar_leaf_entry *ent)
138 {
139         return (char *)&ent->vle_key;
140 }
141
142 static inline struct iam_key *e_key(const struct lvar_leaf_entry *ent)
143 {
144         return (struct iam_key *)e_char(ent);
145 }
146
147 static inline lvar_hash_t e_hash(const struct lvar_leaf_entry *ent)
148 {
149         return le32_to_cpu(ent->vle_hash);
150 }
151
152 static void e_print(const struct lvar_leaf_entry *ent)
153 {
154         CERROR("        %p %8.8x \"%*.*s\"\n", ent, e_hash(ent),
155                         e_keysize(ent), e_keysize(ent), e_char(ent));
156 }
157
158 static inline struct lvar_leaf_entry *e_next(const struct iam_leaf *leaf,
159                                              const struct lvar_leaf_entry *ent)
160 {
161         return ((void *)ent) + e_size(leaf, ent);
162 }
163
164 #define LVAR_HASH_SANDWICH  (0)
165 #define LVAR_HASH_TEA       (1)
166 #define LVAR_HASH_R5        (0)
167 #define LVAR_HASH_PREFIX    (0)
168
169 #ifdef HAVE_LDISKFSFS_GETHASH_INODE_ARG
170 /*
171  * NOTE: doing this breaks on file systems configured with
172  *       case-insensitive file name lookups
173  *
174  * kernel 5.2 commit b886ee3e778ec2ad43e276fd378ab492cf6819b7
175  * ext4: Support case-insensitive file name lookups
176  *
177  * FUTURE:
178  *  We need to pass the struct inode *dir down to hash_build0
179  *  to enable case-insensitive file name support ext4/ldiskfs
180  */
181 #define e_ldiskfsfs_dirhash(name, len, info) \
182                 __ldiskfsfs_dirhash(name, len, info)
183 #else
184 #define e_ldiskfsfs_dirhash(name, len, info) \
185                 ldiskfsfs_dirhash(name, len, info)
186 #endif
187
188 static u32 hash_build0(const char *name, int namelen)
189 {
190         u32 result;
191
192         if (namelen == 0)
193                 return 0;
194         if (strncmp(name, ".", 1) == 0 && namelen == 1)
195                 return 1;
196         if (strncmp(name, "..", 2) == 0 && namelen == 2)
197                 return 2;
198
199         if (LVAR_HASH_PREFIX) {
200                 result = 0;
201                 strncpy((void *)&result,
202                         name, min_t(int, namelen, sizeof(result)));
203         } else {
204                 struct ldiskfs_dx_hash_info hinfo;
205
206                 hinfo.hash_version = LDISKFS_DX_HASH_TEA;
207                 hinfo.seed = NULL;
208                 e_ldiskfsfs_dirhash(name, namelen, &hinfo);
209                 result = hinfo.hash;
210                 if (LVAR_HASH_SANDWICH) {
211                         u32 result2;
212
213                         hinfo.hash_version = LDISKFS_DX_HASH_TEA;
214                         hinfo.seed = NULL;
215                         e_ldiskfsfs_dirhash(name, namelen, &hinfo);
216                         result2 = hinfo.hash;
217                         result = (0xfc000000 & result2) | (0x03ffffff & result);
218                 }
219         }
220         return result;
221 }
222
223 enum {
224         HASH_GRAY_AREA = 1024,
225         HASH_MAX_SIZE  = 0x7fffffffUL
226 };
227
228 static u32 hash_build(const char *name, int namelen)
229 {
230         u32 hash;
231
232         hash = (hash_build0(name, namelen) << 1) & HASH_MAX_SIZE;
233         if (hash > HASH_MAX_SIZE - HASH_GRAY_AREA)
234                 hash &= HASH_GRAY_AREA - 1;
235         return hash;
236 }
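
hash_build0() reserves 0, 1 and 2 for the empty name, "." and "..", and hash_build() then folds the result into a 31-bit space: the left shift keeps bit 0 clear (it is reused as the collision marker in lvar_split() below), and values that land in the "gray area" just below HASH_MAX_SIZE wrap down to small values. A standalone sketch of that clamping, using an arbitrary stand-in (djb2) in place of the TEA directory hash:

#include <stdint.h>
#include <stdio.h>

#define DEMO_HASH_MAX_SIZE  0x7fffffffUL
#define DEMO_HASH_GRAY_AREA 1024

/* Stand-in for hash_build0(); any 32-bit name hash will do here. */
static uint32_t demo_hash0(const char *name)
{
        uint32_t h = 5381;      /* djb2, purely illustrative */

        while (*name)
                h = h * 33 + (unsigned char)*name++;
        return h;
}

/* Mirror of the clamping performed by hash_build() above. */
static uint32_t demo_hash(const char *name)
{
        uint32_t h = (demo_hash0(name) << 1) & DEMO_HASH_MAX_SIZE;

        if (h > DEMO_HASH_MAX_SIZE - DEMO_HASH_GRAY_AREA)
                h &= DEMO_HASH_GRAY_AREA - 1;
        return h;
}

int main(void)
{
        /* every result fits in 31 bits with bit 0 clear */
        printf("%#x\n", demo_hash("foo"));
        printf("%#x\n", demo_hash("a-much-longer-name"));
        return 0;
}
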
237
238 static inline lvar_hash_t get_hash(const struct iam_container *bag,
239                                    const char *name, int namelen)
240 {
241         return hash_build(name, namelen);
242 }
243
244 static inline int e_eq(const struct lvar_leaf_entry *ent,
245                        const char *name, int namelen)
246 {
247         return namelen == e_keysize(ent) && !memcmp(e_char(ent), name, namelen);
248 }
249
250 static inline int e_cmp(const struct iam_leaf *leaf,
251                         const struct lvar_leaf_entry *ent, lvar_hash_t hash)
252 {
253         lvar_hash_t ehash;
254
255         ehash = e_hash(ent);
256         return ehash == hash ? 0 : (ehash < hash ? -1 : 1);
257 }
258
259 static struct lvar_leaf_header *n_head(const struct iam_leaf *l)
260 {
261         return (struct lvar_leaf_header *)l->il_bh->b_data;
262 }
263
264 static int h_used(const struct lvar_leaf_header *hdr)
265 {
266         return le16_to_cpu(hdr->vlh_used);
267 }
268
269 static void h_used_adj(const struct iam_leaf *leaf,
270                        struct lvar_leaf_header *hdr, int adj)
271 {
272         int used;
273
274         used = h_used(hdr) + adj;
275         assert_corr(sizeof(*hdr) <= used && used <= blocksize(leaf));
276         hdr->vlh_used = cpu_to_le16(used);
277 }
278
279 static struct lvar_leaf_entry *n_start(const struct iam_leaf *leaf)
280 {
281         return (void *)leaf->il_bh->b_data + sizeof(struct lvar_leaf_header);
282 }
283
284 static struct lvar_leaf_entry *n_end(const struct iam_leaf *l)
285 {
286         return (void *)l->il_bh->b_data + h_used(n_head(l));
287 }
288
289 static struct lvar_leaf_entry *n_cur(const struct iam_leaf *l)
290 {
291         return lentry_lvar(l->il_at);
292 }
293
294 void n_print(const struct iam_leaf *l)
295 {
296         struct lvar_leaf_entry *scan;
297
298         CERROR("used: %d\n", h_used(n_head(l)));
299         for (scan = n_start(l); scan < n_end(l); scan = e_next(l, scan))
300                 e_print(scan);
301 }
302
303 #if LDISKFS_CORRECTNESS_ON
304 static int n_at_rec(const struct iam_leaf *folio)
305 {
306         return n_start(folio) <= lentry_lvar(folio->il_at) &&
307                 lentry_lvar(folio->il_at) < n_end(folio);
308 }
309
310 #if LDISKFS_INVARIANT_ON
311 static int n_invariant(const struct iam_leaf *leaf)
312 {
313         struct iam_path *path;
314         struct lvar_leaf_entry *scan;
315         struct lvar_leaf_entry *end;
316         lvar_hash_t hash;
317         lvar_hash_t nexthash;
318         lvar_hash_t starthash;
319
320         end  = n_end(leaf);
321         hash = 0;
322         path = leaf->il_path;
323
324         if (h_used(n_head(leaf)) > blocksize(leaf))
325                 return 0;
326
327         /*
328          * Delimiting key in the parent index node. Clear least bit to account
329          * for hash collision marker.
330          */
331         starthash = *(lvar_hash_t *)iam_ikey_at(path, path->ip_frame->at) & ~1;
332         for (scan = n_start(leaf); scan < end; scan = e_next(leaf, scan)) {
333                 nexthash = e_hash(scan);
334                 if (nexthash != get_hash(iam_leaf_container(leaf),
335                                         e_char(scan), e_keysize(scan))) {
336                         BREAKPOINT();
337                         return 0;
338                 }
339                 if (0 && nexthash < starthash) {
340                         /*
341                          * Unfortunately this useful invariant cannot be
342                          * reliably checked as parent node is not necessarily
343                          * locked.
344                          */
345                         n_print(leaf);
346                         CERROR("%#x < %#x\n", nexthash, starthash);
347                         dump_stack();
348                         return 0;
349                 }
350                 if (nexthash < hash) {
351                         BREAKPOINT();
352                         return 0;
353                 }
354                 hash = nexthash;
355         }
356         if (scan != end) {
357                 BREAKPOINT();
358                 return 0;
359         }
360         return 1;
361 }
362 /* LDISKFS_INVARIANT_ON */
363 #endif
364
365 /* LDISKFS_CORRECTNESS_ON */
366 #endif
367
368 static struct iam_ikey *lvar_ikey(const struct iam_leaf *l,
369                                   struct iam_ikey *key)
370 {
371         lvar_hash_t *hash;
372
373         assert_corr(n_at_rec(l));
374
375         hash = (void *)key;
376         *hash = e_hash(n_cur(l));
377         return key;
378 }
379
380 static struct iam_key *lvar_key(const struct iam_leaf *l)
381 {
382         return e_key(n_cur(l));
383 }
384
385 static int lvar_key_size(const struct iam_leaf *l)
386 {
387         return e_keysize(n_cur(l));
388 }
389
390 static void lvar_start(struct iam_leaf *l)
391 {
392         l->il_at = lvar_lentry(n_start(l));
393 }
394
395 static int lvar_init(struct iam_leaf *l)
396 {
397         int result;
398         int used;
399         struct lvar_leaf_header *head;
400
401         assert_corr(l->il_bh != NULL);
402
403         head = n_head(l);
404         used = h_used(head);
405         if (le16_to_cpu(head->vlh_magic) == IAM_LVAR_LEAF_MAGIC &&
406                         used <= blocksize(l)) {
407                 l->il_at = l->il_entries = lvar_lentry(n_start(l));
408                 result = 0;
409         } else {
410                 struct inode *obj;
411
412                 obj = iam_leaf_container(l)->ic_object;
413                 CERROR(
414                 "Bad magic in node %llu (#%lu): %#x != %#x or wrong used: %d\n",
415                 (unsigned long long)l->il_bh->b_blocknr, obj->i_ino,
416                 le16_to_cpu(head->vlh_magic), IAM_LVAR_LEAF_MAGIC,
417                 used);
418                 result = -EIO;
419         }
420         return result;
421 }
422
423 static void lvar_fini(struct iam_leaf *l)
424 {
425         l->il_entries = l->il_at = NULL;
426 }
427
428 static struct iam_rec *lvar_rec(const struct iam_leaf *l)
429 {
430         assert_corr(n_at_rec(l));
431         return e_rec(n_cur(l));
432 }
433
434 static void lvar_next(struct iam_leaf *l)
435 {
436         assert_corr(n_at_rec(l));
437         assert_corr(iam_leaf_is_locked(l));
438         l->il_at = lvar_lentry(e_next(l, n_cur(l)));
439 }
440
441 static int lvar_lookup(struct iam_leaf *leaf, const struct iam_key *k)
442 {
443         struct lvar_leaf_entry *found;
444         struct lvar_leaf_entry *scan;
445         struct lvar_leaf_entry *end;
446         int result;
447         const char *name;
448         int namelen;
449         int found_equal;
450         lvar_hash_t hash;
451         int last;
452
453         assert_inv(n_invariant(leaf));
454         end = n_end(leaf);
455
456         name = kchar(k);
457         namelen = strlen(name);
458         hash = get_hash(iam_leaf_container(leaf), name, namelen);
459         found = NULL;
460         found_equal = 0;
461         last = 1;
462
463         for (scan = n_start(leaf); scan < end; scan = e_next(leaf, scan)) {
464                 lvar_hash_t scan_hash;
465
466                 scan_hash = e_hash(scan);
467                 if (scan_hash < hash)
468                         found = scan;
469                 else if (scan_hash == hash) {
470                         if (e_eq(scan, name, namelen)) {
471                                 /*
472                                  * perfect match
473                                  */
474                                 leaf->il_at = lvar_lentry(scan);
475                                 return IAM_LOOKUP_EXACT;
476                         } else if (!found_equal) {
477                                 found = scan;
478                                 found_equal = 1;
479                         }
480                 } else {
481                         last = 0;
482                         break;
483                 }
484         }
485         if (found == NULL) {
486                 /*
487                  * @k is less than all hashes in the leaf.
488                  */
489                 lvar_start(leaf);
490                 result = IAM_LOOKUP_BEFORE;
491         } else {
492                 leaf->il_at = lvar_lentry(found);
493                 result = IAM_LOOKUP_OK;
494                 assert_corr(n_at_rec(leaf));
495         }
496         if (last)
497                 result |= IAM_LOOKUP_LAST;
498         assert_inv(n_invariant(leaf));
499
500         return result;
501 }
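
The scan leaves the cursor on the last entry whose hash is below the target, or on the first entry of a run of colliding hashes unless one of them matches the name exactly. A simplified, standalone illustration of that collision handling over an in-memory array (the names, hashes and the -1 "before everything" convention are made up for the example):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Simplified in-memory stand-in for a leaf: entries sorted by hash. */
struct demo_ent {
        uint32_t hash;
        const char *name;
};

/*
 * Mirror of the scan in lvar_lookup(): return the index where @name
 * belongs; *exact is set when the name itself was found.
 */
static int demo_lookup(const struct demo_ent *e, int n,
                       uint32_t hash, const char *name, int *exact)
{
        int found = -1, found_equal = 0, i;

        *exact = 0;
        for (i = 0; i < n; i++) {
                if (e[i].hash < hash) {
                        found = i;
                } else if (e[i].hash == hash) {
                        if (strcmp(e[i].name, name) == 0) {
                                *exact = 1;
                                return i;       /* perfect match */
                        } else if (!found_equal) {
                                found = i;      /* first entry of the collision run */
                                found_equal = 1;
                        }
                } else {
                        break;                  /* past the target hash */
                }
        }
        return found;                           /* -1: before all entries */
}

int main(void)
{
        /* two entries collide on hash 0x10 */
        struct demo_ent leaf[] = {
                { 0x08, "aa" }, { 0x10, "bb" }, { 0x10, "cc" }, { 0x20, "dd" },
        };
        int exact, at;

        at = demo_lookup(leaf, 4, 0x10, "cc", &exact);
        printf("at=%d exact=%d\n", at, exact);  /* at=2 exact=1 */
        at = demo_lookup(leaf, 4, 0x10, "zz", &exact);
        printf("at=%d exact=%d\n", at, exact);  /* at=1 exact=0 */
        return 0;
}
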
502
503 static int lvar_ilookup(struct iam_leaf *leaf, const struct iam_ikey *ik)
504 {
505         struct lvar_leaf_entry *scan;
506         struct lvar_leaf_entry *end;
507         lvar_hash_t hash;
508
509         assert_inv(n_invariant(leaf));
510         end  = n_end(leaf);
511         hash = *(const lvar_hash_t *)ik;
512
513         lvar_start(leaf);
514         for (scan = n_start(leaf); scan < end; scan = e_next(leaf, scan)) {
515                 lvar_hash_t scan_hash;
516
517                 scan_hash = e_hash(scan);
518                 if (scan_hash > hash)
519                         return scan == n_start(leaf) ?
520                                 IAM_LOOKUP_BEFORE : IAM_LOOKUP_OK;
521                 leaf->il_at = lvar_lentry(scan);
522                 if (scan_hash == hash)
523                         return IAM_LOOKUP_EXACT;
524         }
525         assert_inv(n_invariant(leaf));
526         /*
527          * @ik is greater than any key in the node. Return last record in the
528          * node.
529          */
530         return IAM_LOOKUP_OK;
531 }
532
533 static void __lvar_key_set(struct iam_leaf *l, const struct iam_key *k)
534 {
535         memcpy(e_key(n_cur(l)), k, e_keysize(n_cur(l)));
536 }
537
538 static void lvar_key_set(struct iam_leaf *l, const struct iam_key *k)
539 {
540         assert_corr(n_at_rec(l));
541         assert_corr(strlen(kchar(k)) == e_keysize(n_cur(l)));
542         assert_corr(iam_leaf_is_locked(l));
543         __lvar_key_set(l, k);
544         assert_inv(n_invariant(l));
545 }
546
547 static int lvar_key_cmp(const struct iam_leaf *l, const struct iam_key *k)
548 {
549         lvar_hash_t hash;
550         const char *name;
551
552         name = kchar(k);
553
554         hash = get_hash(iam_leaf_container(l), name, strlen(name));
555         return e_cmp(l, n_cur(l), hash);
556 }
557
558 static int lvar_key_eq(const struct iam_leaf *l, const struct iam_key *k)
559 {
560         const char *name;
561
562         name = kchar(k);
563         return e_eq(n_cur(l), name, strlen(name));
564 }
565
566 static void __lvar_rec_set(struct iam_leaf *l, const struct iam_rec *r)
567 {
568         memcpy(e_rec(n_cur(l)), r, rec_size(r));
569 }
570
571 static void lvar_rec_set(struct iam_leaf *l, const struct iam_rec *r)
572 {
573         assert_corr(n_at_rec(l));
574         assert_corr(iam_leaf_is_locked(l));
575         __lvar_rec_set(l, r);
576         assert_inv(n_invariant(l));
577 }
578
579 static int lvar_rec_eq(const struct iam_leaf *l, const struct iam_rec *r)
580 {
581         struct iam_rec *rec = e_rec(n_cur(l));
582
583         if (rec_size(rec) != rec_size(r))
584                 return 0;
585         return !memcmp(rec, r, rec_size(r));
586 }
587
588 static void lvar_rec_get(const struct iam_leaf *l, struct iam_rec *r)
589 {
590         struct iam_rec *rec;
591
592         rec = e_rec(n_cur(l));
593         assert_corr(n_at_rec(l));
594         assert_corr(iam_leaf_is_locked(l));
595         memcpy(r, rec, rec_size(rec));
596         assert_inv(n_invariant(l));
597 }
598
599 static int lvar_can_add(const struct iam_leaf *l,
600                         const struct iam_key *k, const struct iam_rec *r)
601 {
602         assert_corr(iam_leaf_is_locked(l));
603         return h_used(n_head(l)) +
604                 getsize(l, strlen(kchar(k)), rec_size(r)) <= blocksize(l);
605 }
606
607 static int lvar_at_end(const struct iam_leaf *folio)
608 {
609         assert_corr(iam_leaf_is_locked(folio));
610         return n_cur(folio) == n_end(folio);
611 }
612
613 static void lvar_rec_add(struct iam_leaf *leaf,
614                          const struct iam_key *k, const struct iam_rec *r)
615 {
616         const char *key;
617         int ksize;
618         int shift;
619         void *end;
620         void *start;
621         ptrdiff_t diff;
622
623         assert_corr(lvar_can_add(leaf, k, r));
624         assert_inv(n_invariant(leaf));
625         assert_corr(iam_leaf_is_locked(leaf));
626
627         key   = kchar(k);
628         ksize = strlen(key);
629         shift = getsize(leaf, ksize, rec_size(r));
630
631         if (!lvar_at_end(leaf)) {
632                 assert_corr(n_cur(leaf) < n_end(leaf));
633                 end = n_end(leaf);
634                 if (lvar_key_cmp(leaf, k) <= 0)
635                         lvar_next(leaf);
636                 else
637                         /*
638                          * Another exceptional case: insertion with the key
639                          * less than least key in the leaf.
640                          */
641                         assert_corr(leaf->il_at == leaf->il_entries);
642
643                 start = leaf->il_at;
644                 diff  = PDIFF(end, start);
645                 assert_corr(diff >= 0);
646                 memmove(start + shift, start, diff);
647         }
648         h_used_adj(leaf, n_head(leaf), shift);
649         n_cur(leaf)->vle_keysize = cpu_to_le16(ksize);
650         n_cur(leaf)->vle_hash = cpu_to_le32(get_hash(iam_leaf_container(leaf),
651                                             key, ksize));
652         __lvar_key_set(leaf, k);
653         __lvar_rec_set(leaf, r);
654         assert_corr(n_at_rec(leaf));
655         assert_inv(n_invariant(leaf));
656 }
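
The insertion itself is the classic "open a gap with memmove(), then write header, key and record into it" pattern on a packed buffer; lvar_can_add() has already guaranteed the block has room. A minimal standalone sketch of the gap-opening step:

#include <stdio.h>
#include <string.h>

/*
 * Open a @gap-byte hole at offset @at inside a buffer whose first @used
 * bytes are valid, mirroring the memmove() in lvar_rec_add() above.
 * Returns the new used size (no overflow checking; see lvar_can_add()).
 */
static size_t demo_open_gap(unsigned char *buf, size_t used,
                            size_t at, size_t gap)
{
        memmove(buf + at + gap, buf + at, used - at);
        return used + gap;
}

int main(void)
{
        unsigned char buf[32] = "AAAABBBBCCCC";
        size_t used = 12;

        /* make room for an 8-byte entry between "AAAA" and "BBBB" */
        used = demo_open_gap(buf, used, 4, 8);
        memcpy(buf + 4, "XXXXXXXX", 8);
        printf("%.*s\n", (int)used, buf);       /* AAAAXXXXXXXXBBBBCCCC */
        return 0;
}
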
657
658 static void lvar_rec_del(struct iam_leaf *leaf, int shift)
659 {
660         void *next;
661         void *end;
662         int nob;
663
664         assert_corr(n_at_rec(leaf));
665         assert_inv(n_invariant(leaf));
666         assert_corr(iam_leaf_is_locked(leaf));
667
668         end  = n_end(leaf);
669         next = e_next(leaf, n_cur(leaf));
670         nob  = e_size(leaf, n_cur(leaf));
671         memmove(leaf->il_at, next, end - next);
672         h_used_adj(leaf, n_head(leaf), -nob);
673         assert_inv(n_invariant(leaf));
674 }
675
676 static void lvar_init_new(struct iam_container *c, struct buffer_head *bh)
677 {
678         struct lvar_leaf_header *hdr;
679
680         hdr = (struct lvar_leaf_header *)bh->b_data;
681         hdr->vlh_magic = cpu_to_le16(IAM_LVAR_LEAF_MAGIC);
682         hdr->vlh_used  = cpu_to_le16(sizeof(*hdr));
683 }
684
685 static struct lvar_leaf_entry *find_pivot(const struct iam_leaf *leaf,
686                                           struct lvar_leaf_entry **prev)
687 {
688         void *scan;
689         void *start;
690         int threshold;
691
692         *prev = NULL;
693         threshold = blocksize(leaf) / 2;
694         for (scan = start = n_start(leaf); scan - start <= threshold;
695                         *prev = scan, scan = e_next(leaf, scan)) {
696                 ;
697         }
698         return scan;
699 }
700
701 static void lvar_split(struct iam_leaf *leaf, struct buffer_head **bh,
702                        iam_ptr_t new_blknr)
703 {
704         struct lvar_leaf_entry *first_to_move;
705         struct lvar_leaf_entry *last_to_stay;
706         struct iam_path *path;
707         struct lvar_leaf_header *hdr;
708         struct buffer_head *new_leaf;
709         ptrdiff_t tomove;
710         lvar_hash_t hash;
711
712         assert_inv(n_invariant(leaf));
713         assert_corr(iam_leaf_is_locked(leaf));
714
715         new_leaf = *bh;
716         path = iam_leaf_path(leaf);
717
718         hdr = (void *)new_leaf->b_data;
719
720         first_to_move = find_pivot(leaf, &last_to_stay);
721         assert_corr(last_to_stay != NULL);
722         assert_corr(e_next(leaf, last_to_stay) == first_to_move);
723
724         hash = e_hash(first_to_move);
725         if (hash == e_hash(last_to_stay))
726                 /*
727                  * Duplicate hash.
728                  */
729                 hash |= 1;
730
731         tomove = PDIFF(n_end(leaf), first_to_move);
732         memmove(hdr + 1, first_to_move, tomove);
733
734         h_used_adj(leaf, hdr, tomove);
735         h_used_adj(leaf, n_head(leaf), -tomove);
736
737         assert_corr(n_end(leaf) == first_to_move);
738
739         if (n_cur(leaf) >= first_to_move) {
740                 /*
741                  * insertion point moves into new leaf.
742                  */
743                 ptrdiff_t shift;
744                 int result;
745
746                 shift = PDIFF(leaf->il_at, first_to_move);
747                 *bh = leaf->il_bh;
748                 leaf->il_bh = new_leaf;
749                 leaf->il_curidx = new_blknr;
750
751                 assert_corr(iam_leaf_is_locked(leaf));
752                 result = lvar_init(leaf);
753
754                 /* init cannot fail, as the node was just initialized */
755                 assert_corr(result == 0);
756                 leaf->il_at = ((void *)leaf->il_at) + shift;
757         }
758         /*
759          * Insert pointer to the new node (together with the least key in
760          * the node) into index node.
761          */
762         iam_insert_key_lock(path, path->ip_frame, (struct iam_ikey *)&hash,
763                             new_blknr);
764         assert_corr(n_cur(leaf) < n_end(leaf));
765         assert_inv(n_invariant(leaf));
766 }
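
The only subtle part of the split is the delimiting key: when the first entry moved into the new leaf hashes the same as the last entry kept, the low bit of the hash, which hash_build() never sets and which n_invariant() masks off, serves as a collision marker. A tiny illustration of how that marker is chosen (the helper and values are hypothetical):

#include <stdint.h>
#include <stdio.h>

/* Delimiting hash pushed into the parent index node when a leaf is split. */
static uint32_t demo_split_key(uint32_t last_kept, uint32_t first_moved)
{
        uint32_t key = first_moved;

        if (first_moved == last_kept)
                key |= 1;       /* collision marker: bit 0 is never set by hash_build() */
        return key;
}

int main(void)
{
        printf("%#x\n", demo_split_key(0x40, 0x50));    /* 0x50: clean cut */
        printf("%#x\n", demo_split_key(0x40, 0x40));    /* 0x41: collision spans two leaves */
        return 0;
}
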
767
768 static int lvar_leaf_empty(struct iam_leaf *leaf)
769 {
770         return h_used(n_head(leaf)) == sizeof(struct lvar_leaf_header);
771 }
772
773 static struct iam_leaf_operations lvar_leaf_ops = {
774         .init           = lvar_init,
775         .init_new       = lvar_init_new,
776         .fini           = lvar_fini,
777         .start          = lvar_start,
778         .next           = lvar_next,
779         .key            = lvar_key,
780         .ikey           = lvar_ikey,
781         .rec            = lvar_rec,
782         .key_set        = lvar_key_set,
783         .key_cmp        = lvar_key_cmp,
784         .key_eq         = lvar_key_eq,
785         .key_size       = lvar_key_size,
786         .rec_set        = lvar_rec_set,
787         .rec_eq         = lvar_rec_eq,
788         .rec_get        = lvar_rec_get,
789         .lookup         = lvar_lookup,
790         .ilookup        = lvar_ilookup,
791         .at_end         = lvar_at_end,
792         .rec_add        = lvar_rec_add,
793         .rec_del        = lvar_rec_del,
794         .can_add        = lvar_can_add,
795         .split          = lvar_split,
796         .leaf_empty     = lvar_leaf_empty,
797 };
798
799 /*
800  * Index operations.
801  */
802
803 enum {
804         /* This is duplicated in lustre/utils/create_iam.c */
805         /* egrep -i '^o?x?[olabcdef]*$' /usr/share/dict/words */
806         IAM_LVAR_ROOT_MAGIC = 0xb01dface
807 };
808
809 /* This is duplicated in lustre/utils/create_iam.c */
810 struct lvar_root {
811         __le32 vr_magic;
812         __le16 vr_recsize;
813         __le16 vr_ptrsize;
814         u8 vr_indirect_levels;
815         u8 vr_padding0;
816         __le16 vr_padding1;
817 };
818
819 static u32 lvar_root_ptr(struct iam_container *c)
820 {
821         return 0;
822 }
823
824 static int lvar_node_init(struct iam_container *c, struct buffer_head *bh,
825                           int root)
826 {
827         return 0;
828 }
829
830 static struct iam_entry *lvar_root_inc(struct iam_container *c,
831                                        struct iam_path *path,
832                                        struct iam_frame *frame)
833 {
834         struct lvar_root *root;
835         struct iam_entry *entries;
836
837         assert_corr(iam_frame_is_locked(path, frame));
838         entries = frame->entries;
839
840         dx_set_count(entries, 2);
841         assert_corr(dx_get_limit(entries) == dx_root_limit(path));
842
843         root = (void *)frame->bh->b_data;
844         assert_corr(le32_to_cpu(root->vr_magic) == IAM_LVAR_ROOT_MAGIC);
845         root->vr_indirect_levels++;
846         frame->at = entries = iam_entry_shift(path, entries, 1);
847         memset(iam_ikey_at(path, entries), 0,
848                iam_path_descr(path)->id_ikey_size);
849         return entries;
850 }
851
852 static int lvar_node_check(struct iam_path *path, struct iam_frame *frame)
853 {
854         unsigned int count;
855         unsigned int limit;
856         unsigned int limit_correct;
857         struct iam_entry *entries;
858
859         entries = dx_node_get_entries(path, frame);
860
861         if (frame == path->ip_frames) {
862                 struct lvar_root *root;
863
864                 root = (void *)frame->bh->b_data;
865                 if (le32_to_cpu(root->vr_magic) != IAM_LVAR_ROOT_MAGIC)
866                         return -EIO;
867                 limit_correct = dx_root_limit(path);
868         } else
869                 limit_correct = dx_node_limit(path);
870         count = dx_get_count(entries);
871         limit = dx_get_limit(entries);
872         if (count > limit)
873                 return -EIO;
874         if (limit != limit_correct)
875                 return -EIO;
876         return 0;
877 }
878
879 static int lvar_node_load(struct iam_path *path, struct iam_frame *frame)
880 {
881         struct iam_entry *entries;
882         void *data;
883
884         entries = dx_node_get_entries(path, frame);
885         data = frame->bh->b_data;
886
887         if (frame == path->ip_frames) {
888                 struct lvar_root *root;
889                 const char *name;
890
891                 root = data;
892                 name = kchar(path->ip_key_target);
893                 path->ip_indirect = root->vr_indirect_levels;
894                 if (path->ip_ikey_target == NULL) {
895                         path->ip_ikey_target = iam_path_ikey(path, 4);
896                         *(lvar_hash_t *)path->ip_ikey_target =
897                                 get_hash(path->ip_container, name,
898                                          strlen(name));
899                 }
900         }
901         frame->entries = frame->at = entries;
902         return 0;
903 }
904
905 static int lvar_ikeycmp(const struct iam_container *c,
906                         const struct iam_ikey *k1, const struct iam_ikey *k2)
907 {
908         lvar_hash_t p1 = le32_to_cpu(*(lvar_hash_t *)k1);
909         lvar_hash_t p2 = le32_to_cpu(*(lvar_hash_t *)k2);
910
911         return p1 > p2 ? 1 : (p1 < p2 ? -1 : 0);
912 }
913
914 static struct iam_path_descr *lvar_ipd_alloc(const struct iam_container *c,
915                                              void *area)
916 {
917         return iam_ipd_alloc(area, c->ic_descr->id_ikey_size);
918 }
919
920 static void lvar_root(void *buf,
921                       int blocksize, int keysize, int ptrsize, int recsize)
922 {
923         struct lvar_root *root;
924         struct dx_countlimit *limit;
925         void *entry;
926         int isize;
927
928         isize = sizeof(lvar_hash_t) + ptrsize;
929         root = buf;
930         *root = (typeof(*root)) {
931                 .vr_magic            = cpu_to_le32(IAM_LVAR_ROOT_MAGIC),
932                 .vr_recsize          = cpu_to_le16(recsize),
933                 .vr_ptrsize          = cpu_to_le16(ptrsize),
934                 .vr_indirect_levels  = 0
935         };
936
937         limit = (void *)(root + 1);
938         *limit = (typeof(*limit)){
939                 /*
940                  * limit itself + one pointer to the leaf.
941                  */
942                 .count = cpu_to_le16(2),
943                 .limit = cpu_to_le16(iam_root_limit(sizeof(struct lvar_root),
944                                      blocksize, sizeof(lvar_hash_t) + ptrsize))
945         };
946
947         /* To guarantee that the padding "keysize + ptrsize"
948          * covers the "dx_countlimit" and the "idle_blocks". */
949         LASSERT((keysize + ptrsize) >=
950                 (sizeof(struct dx_countlimit) + sizeof(u32)));
951
952         entry = (void *)(limit + 1);
953         /* Put "idle_blocks" just after the limit. There was padding after
954          * the limit, the "idle_blocks" re-uses part of the padding, so no
955          * compatibility issues with old layout.
956          */
957         *(u32 *)entry = 0;
958
959         /*
960          * Skip over @limit.
961          */
962         entry = (void *)(root + 1) + isize;
963
964         /*
965          * Entry format is <key> followed by <ptr>. In the minimal tree
966          * consisting of a root and single node, <key> is a minimal possible
967          * key.
968          */
969         *(lvar_hash_t *)entry = 0;
970         entry += sizeof(lvar_hash_t);
971         /* now @entry points to <ptr> */
972         if (ptrsize == 4)
973                 *(u_int32_t *)entry = cpu_to_le32(1);
974         else
975                 *(u_int64_t *)entry = cpu_to_le64(1);
976 }
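
For a concrete picture of the resulting root block (assuming ptrsize = 4 and the 4-byte lvar_hash_t): struct lvar_root occupies bytes 0-11; the dx_countlimit slot sits at offset 12 with count = 2 (the slot itself plus one pointer to the leaf) and the computed root limit; the "idle_blocks" u32 reuses the padding at offset 16; and the first real <hash, ptr> entry starts at offset 12 + isize = 20, holding hash 0 and a pointer to block 1, the leaf block that iam_lvar_create() appends right after the root.
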
977
978 static int lvar_esize(int namelen, int recsize)
979 {
980         return (offsetof(struct lvar_leaf_entry, vle_key) +
981                         namelen + recsize + LVAR_ROUND) & ~LVAR_ROUND;
982 }
983
984 static void lvar_leaf(void *buf,
985                       int blocksize, int keysize, int ptrsize, int recsize)
986 {
987         struct lvar_leaf_header *head;
988         struct lvar_leaf_entry *entry;
989
990         /* form leaf */
991         head = buf;
992         *head = (typeof(*head)) {
993                 .vlh_magic = cpu_to_le16(IAM_LVAR_LEAF_MAGIC),
994                 .vlh_used  = cpu_to_le16(sizeof(*head) + lvar_esize(0, recsize))
995         };
996         entry = (void *)(head + 1);
997         *entry = (typeof(*entry)) {
998                 .vle_hash    = 0,
999                 .vle_keysize = 0
1000         };
1001         memset(e_rec(entry), 0, recsize);
1002         *(char *)e_rec(entry) = recsize;
1003 }
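
Worked example: with recsize = 8 the fresh leaf carries vlh_used = sizeof(header) + lvar_esize(0, 8) = 4 + ((6 + 0 + 8 + 3) & ~3) = 20, i.e. the 4-byte header followed by a single padded entry with hash 0, an empty key, and a zeroed record whose first byte stores the record size.
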
1004
1005 int iam_lvar_create(struct inode *obj,
1006                     int keysize, int ptrsize, int recsize, handle_t *handle)
1007 {
1008         struct buffer_head *root_node;
1009         struct buffer_head *leaf_node;
1010         struct super_block *sb;
1011
1012         u32 blknr;
1013         int result = 0;
1014         unsigned long bsize;
1015
1016         assert_corr(obj->i_size == 0);
1017
1018         sb = obj->i_sb;
1019         bsize = sb->s_blocksize;
1020         root_node = osd_ldiskfs_append(handle, obj, &blknr);
1021         if (IS_ERR(root_node))
1022                 GOTO(out, result = PTR_ERR(root_node));
1023
1024         leaf_node = osd_ldiskfs_append(handle, obj, &blknr);
1025         if (IS_ERR(leaf_node))
1026                 GOTO(out_root, result = PTR_ERR(leaf_node));
1027
1028         lvar_root(root_node->b_data, bsize, keysize, ptrsize, recsize);
1029         lvar_leaf(leaf_node->b_data, bsize, keysize, ptrsize, recsize);
1030         ldiskfs_mark_inode_dirty(handle, obj);
1031         result = ldiskfs_handle_dirty_metadata(handle, NULL, root_node);
1032         if (result == 0)
1033                 result = ldiskfs_handle_dirty_metadata(handle, NULL, leaf_node);
1034         if (result != 0)
1035                 ldiskfs_std_error(sb, result);
1036
1037         brelse(leaf_node);
1038
1039         GOTO(out_root, result);
1040
1041 out_root:
1042         brelse(root_node);
1043 out:
1044         return result;
1045 }
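
A hypothetical caller sketch (not taken from the tree): the inode must be freshly created and empty, and the handle must belong to an already started transaction with enough credits for the two appended blocks plus the inode update; the key, pointer and record sizes below are only illustrative:

static int demo_make_lvar_index(struct inode *obj, handle_t *handle)
{
        const int keysize = LDISKFS_NAME_LEN;   /* longest name the leaves may hold */
        const int ptrsize = 4;                  /* 32-bit block pointers */
        const int recsize = 32;                 /* size of the fixed records */

        return iam_lvar_create(obj, keysize, ptrsize, recsize, handle);
}
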
1046
1047 static struct iam_operations lvar_ops = {
1048         .id_root_ptr    = lvar_root_ptr,
1049         .id_node_read   = iam_node_read,
1050         .id_node_init   = lvar_node_init,
1051         .id_node_check  = lvar_node_check,
1052         .id_node_load   = lvar_node_load,
1053         .id_ikeycmp     = lvar_ikeycmp,
1054         .id_root_inc    = lvar_root_inc,
1055         .id_ipd_alloc   = lvar_ipd_alloc,
1056         .id_ipd_free    = iam_ipd_free,
1057         .id_name        = "lvar"
1058 };
1059
1060 int iam_lvar_guess(struct iam_container *c)
1061 {
1062         int result;
1063         struct buffer_head *bh;
1064         const struct lvar_root *root;
1065
1066         assert_corr(c->ic_object != NULL);
1067
1068         result = iam_node_read(c, lvar_root_ptr(c), NULL, &bh);
1069         if (result == 0) {
1070                 root = (void *)bh->b_data;
1071
1072                 if (le32_to_cpu(root->vr_magic) == IAM_LVAR_ROOT_MAGIC) {
1073                         struct iam_descr *descr;
1074
1075                         descr = c->ic_descr;
1076                         descr->id_key_size  = LDISKFS_NAME_LEN;
1077                         descr->id_ikey_size = sizeof(lvar_hash_t);
1078                         descr->id_rec_size  = le16_to_cpu(root->vr_recsize);
1079                         descr->id_ptr_size  = le16_to_cpu(root->vr_ptrsize);
1080                         descr->id_root_gap  = sizeof(*root);
1081                         descr->id_node_gap  = 0;
1082                         descr->id_ops       = &lvar_ops;
1083                         descr->id_leaf_ops  = &lvar_leaf_ops;
1084                         c->ic_root_bh = bh;
1085                 } else {
1086                         result = -EBADF;
1087                         brelse(bh);
1088                 }
1089         }
1090         return result;
1091 }
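
And a matching detection sketch (again hypothetical): probe the root block for the lvar magic and, on -EBADF, fall back to another probe such as iam_lfix_guess() from osd_iam_lfix.c, assuming that counterpart is available:

static int demo_container_guess(struct iam_container *c)
{
        int result = iam_lvar_guess(c);

        if (result == -EBADF)
                result = iam_lfix_guess(c);     /* assumed fixed-key probe */
        return result;
}
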