lustre/osd-ldiskfs/osd_iam_lvar.c (fs/lustre-release.git)
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.gnu.org/licenses/gpl-2.0.html
19  *
20  * GPL HEADER END
21  */
22 /*
23  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Use is subject to license terms.
25  *
26  * Copyright (c) 2012, 2015, Intel Corporation.
27  */
28 /*
29  * This file is part of Lustre, http://www.lustre.org/
30  *
31  * iam_lvar.c
32  *
33  * Implementation of the iam format for fixed-size records and variable-sized keys.
34  *
35  * Author: Nikita Danilov <nikita@clusterfs.com>
36  */
37
38 #include <linux/types.h>
39 #include "osd_internal.h"
40
41 /*
42  * Leaf operations.
43  */
44
45 enum {
46         /* This is duplicated in lustre/utils/create_iam.c */
47         IAM_LVAR_LEAF_MAGIC = 0x1973
48 };
49
50 /* This is duplicated in lustre/utils/create_iam.c */
51 struct lvar_leaf_header {
52         __le16 vlh_magic; /* magic number IAM_LVAR_LEAF_MAGIC */
53         __le16 vlh_used;  /* used bytes, including header */
54 };
55
56 /*
57  * Format of leaf entry:
58  *
59  * __le16 keysize
60  *     u8 key[keysize]
61  *     u8 record[rec_size]
62  *
63  * Entries are ordered in key order.
64  */
65
66 /* This is duplicated in lustre/utils/create_iam.c */
67 typedef u32 lvar_hash_t;
68
69 /* This is duplicated in lustre/utils/create_iam.c */
70 struct lvar_leaf_entry {
71         __le32 vle_hash;
72         __le16 vle_keysize;
73         u8 vle_key[0];
74 };
75
76 #define PDIFF(ptr0, ptr1) (((char *)(ptr0)) - ((char *)(ptr1)))
77
78
79 static inline int blocksize(const struct iam_leaf *leaf)
80 {
81         return iam_leaf_container(leaf)->ic_object->i_sb->s_blocksize;
82 }
83
84 static inline const char *kchar(const struct iam_key *key)
85 {
86         return (void *)key;
87 }
88
89 static inline struct iam_lentry *lvar_lentry(const struct lvar_leaf_entry *ent)
90 {
91         return (struct iam_lentry *)ent;
92 }
93
94 static inline struct lvar_leaf_entry *lentry_lvar(const struct iam_lentry *lent)
95 {
96         return (struct lvar_leaf_entry *)lent;
97 }
98
99
100 static inline int e_keysize(const struct lvar_leaf_entry *ent)
101 {
102         return le16_to_cpu(ent->vle_keysize);
103 }
104
105 /* This is duplicated in lustre/utils/create_iam.c */
106 enum {
107         LVAR_PAD   = 4,
108         LVAR_ROUND = LVAR_PAD - 1
109 };
110
111 static inline int getsize(const struct iam_leaf *leaf, int namelen, int recsize)
112 {
113         BUILD_BUG_ON((LVAR_PAD & (LVAR_PAD - 1)));
114
115         return (offsetof(struct lvar_leaf_entry, vle_key) +
116                         namelen + recsize + LVAR_ROUND) & ~LVAR_ROUND;
117 }
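/*
 * Worked example (illustrative): offsetof(struct lvar_leaf_entry, vle_key)
 * is 6 bytes (hash + keysize), so a 5-byte name with a 1-byte record needs
 * 6 + 5 + 1 = 12 bytes, already a multiple of LVAR_PAD, while a 6-byte name
 * would need 13 bytes and be padded up to 16.  Every entry size is thus a
 * multiple of 4, keeping entries 4-byte aligned within the leaf block.
 */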
118
119 static inline int rec_size(const struct iam_rec *rec)
120 {
121         return *(const char *)rec;
122 }
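/*
 * Despite the "fixed size records" wording in the file header, the length of
 * a record is read from its own first byte: lvar_leaf() below seeds the
 * initial record with that byte set to the container's recsize, and records
 * passed in by callers appear to follow the same convention, since
 * lvar_rec_add()/lvar_rec_set() copy rec_size(r) bytes from the record itself.
 */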
123
124 static inline struct iam_rec *e_rec(const struct lvar_leaf_entry *ent)
125 {
126         return ((void *)ent) +
127                 offsetof(struct lvar_leaf_entry, vle_key) + e_keysize(ent);
128 }
129
130 static inline int e_size(const struct iam_leaf *leaf,
131                          const struct lvar_leaf_entry *ent)
132 {
133         return getsize(leaf, e_keysize(ent), rec_size(e_rec(ent)));
134 }
135
136 static inline char *e_char(const struct lvar_leaf_entry *ent)
137 {
138         return (char *)&ent->vle_key;
139 }
140
141 static inline struct iam_key *e_key(const struct lvar_leaf_entry *ent)
142 {
143         return (struct iam_key *)e_char(ent);
144 }
145
146 static inline lvar_hash_t e_hash(const struct lvar_leaf_entry *ent)
147 {
148         return le32_to_cpu(ent->vle_hash);
149 }
150
151 static void e_print(const struct lvar_leaf_entry *ent)
152 {
153         CERROR("        %p %8.8x \"%*.*s\"\n", ent, e_hash(ent),
154                         e_keysize(ent), e_keysize(ent), e_char(ent));
155 }
156
157 static inline struct lvar_leaf_entry *e_next(const struct iam_leaf *leaf,
158                                              const struct lvar_leaf_entry *ent)
159 {
160         return ((void *)ent) + e_size(leaf, ent);
161 }
162
163 #define LVAR_HASH_SANDWICH  (0)
164 #define LVAR_HASH_TEA       (1)
165 #define LVAR_HASH_R5        (0)
166 #define LVAR_HASH_PREFIX    (0)
167
168 #ifdef HAVE_LDISKFSFS_GETHASH_INODE_ARG
169 /*
170  * NOTE: doing this breaks on file systems configured with
171  *       case-insensitive file name lookups
172  *
173  * kernel 5.2 commit b886ee3e778ec2ad43e276fd378ab492cf6819b7
174  * ext4: Support case-insensitive file name lookups
175  *
176  * FUTURE:
177  *  We need to pass the struct inode *dir down to hash_build0
178  *  to enable case-insensitive file name support in ext4/ldiskfs
179  */
180 #define e_ldiskfsfs_dirhash(name, len, info) \
181                 __ldiskfsfs_dirhash(name, len, info)
182 #else
183 #define e_ldiskfsfs_dirhash(name, len, info) \
184                 ldiskfsfs_dirhash(name, len, info)
185 #endif
186
187 static u32 hash_build0(const char *name, int namelen)
188 {
189         u32 result;
190
191         if (namelen == 0)
192                 return 0;
193         if (strncmp(name, ".", 1) == 0 && namelen == 1)
194                 return 1;
195         if (strncmp(name, "..", 2) == 0 && namelen == 2)
196                 return 2;
197
198         if (LVAR_HASH_PREFIX) {
199                 result = 0;
200                 strncpy((void *)&result,
201                         name, min_t(int, namelen, sizeof(result)));
202         } else {
203                 struct ldiskfs_dx_hash_info hinfo;
204
205                 hinfo.hash_version = LDISKFS_DX_HASH_TEA;
206                 hinfo.seed = NULL;
207                 e_ldiskfsfs_dirhash(name, namelen, &hinfo);
208                 result = hinfo.hash;
209                 if (LVAR_HASH_SANDWICH) {
210                         u32 result2;
211
212                         hinfo.hash_version = LDISKFS_DX_HASH_TEA;
213                         hinfo.seed = NULL;
214                         e_ldiskfsfs_dirhash(name, namelen, &hinfo);
215                         result2 = hinfo.hash;
216                         result = (0xfc000000 & result2) | (0x03ffffff & result);
217                 }
218         }
219         return result;
220 }
221
222 enum {
223         HASH_GRAY_AREA = 1024,
224         HASH_MAX_SIZE  = 0x7fffffffUL
225 };
226
227 static u32 hash_build(const char *name, int namelen)
228 {
229         u32 hash;
230
231         hash = (hash_build0(name, namelen) << 1) & HASH_MAX_SIZE;
232         if (hash > HASH_MAX_SIZE - HASH_GRAY_AREA)
233                 hash &= HASH_GRAY_AREA - 1;
234         return hash;
235 }
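/*
 * Note that hash_build() always returns an even value: the raw hash is
 * shifted left by one, leaving the least significant bit free to act as the
 * hash collision marker that lvar_split() sets on a delimiting key falling
 * inside a run of equal hashes (see also n_invariant()).  Hashes landing
 * within HASH_GRAY_AREA of HASH_MAX_SIZE are remapped to small values,
 * presumably so that a marked hash near the top of the range cannot wrap.
 */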
236
237 static inline lvar_hash_t get_hash(const struct iam_container *bag,
238                                    const char *name, int namelen)
239 {
240         return hash_build(name, namelen);
241 }
242
243 static inline int e_eq(const struct lvar_leaf_entry *ent,
244                        const char *name, int namelen)
245 {
246         return namelen == e_keysize(ent) && !memcmp(e_char(ent), name, namelen);
247 }
248
249 static inline int e_cmp(const struct iam_leaf *leaf,
250                         const struct lvar_leaf_entry *ent, lvar_hash_t hash)
251 {
252         lvar_hash_t ehash;
253
254         ehash = e_hash(ent);
255         return ehash == hash ? 0 : (ehash < hash ? -1 : 1);
256 }
257
258 static struct lvar_leaf_header *n_head(const struct iam_leaf *l)
259 {
260         return (struct lvar_leaf_header *)l->il_bh->b_data;
261 }
262
263 static int h_used(const struct lvar_leaf_header *hdr)
264 {
265         return le16_to_cpu(hdr->vlh_used);
266 }
267
268 static void h_used_adj(const struct iam_leaf *leaf,
269                        struct lvar_leaf_header *hdr, int adj)
270 {
271         int used;
272
273         used = h_used(hdr) + adj;
274         assert_corr(sizeof(*hdr) <= used && used <= blocksize(leaf));
275         hdr->vlh_used = cpu_to_le16(used);
276 }
277
278 static struct lvar_leaf_entry *n_start(const struct iam_leaf *leaf)
279 {
280         return (void *)leaf->il_bh->b_data + sizeof(struct lvar_leaf_header);
281 }
282
283 static struct lvar_leaf_entry *n_end(const struct iam_leaf *l)
284 {
285         return (void *)l->il_bh->b_data + h_used(n_head(l));
286 }
287
288 static struct lvar_leaf_entry *n_cur(const struct iam_leaf *l)
289 {
290         return lentry_lvar(l->il_at);
291 }
292
293 void n_print(const struct iam_leaf *l)
294 {
295         struct lvar_leaf_entry *scan;
296
297         CERROR("used: %d\n", h_used(n_head(l)));
298         for (scan = n_start(l); scan < n_end(l); scan = e_next(l, scan))
299                 e_print(scan);
300 }
301
302 #if LDISKFS_CORRECTNESS_ON
303 static int n_at_rec(const struct iam_leaf *folio)
304 {
305         return n_start(folio) <= lentry_lvar(folio->il_at) &&
306                 lentry_lvar(folio->il_at) < n_end(folio);
307 }
308
309 #if LDISKFS_INVARIANT_ON
310 static int n_invariant(const struct iam_leaf *leaf)
311 {
312         struct iam_path *path;
313         struct lvar_leaf_entry *scan;
314         struct lvar_leaf_entry *end;
315         lvar_hash_t hash;
316         lvar_hash_t nexthash;
317         lvar_hash_t starthash;
318
319         end  = n_end(leaf);
320         hash = 0;
321         path = leaf->il_path;
322
323         if (h_used(n_head(leaf)) > blocksize(leaf))
324                 return 0;
325
326         /*
327          * Delimiting key in the parent index node. Clear least bit to account
328          * for hash collision marker.
329          */
330         starthash = *(lvar_hash_t *)iam_ikey_at(path, path->ip_frame->at) & ~1;
331         for (scan = n_start(leaf); scan < end; scan = e_next(leaf, scan)) {
332                 nexthash = e_hash(scan);
333                 if (nexthash != get_hash(iam_leaf_container(leaf),
334                                         e_char(scan), e_keysize(scan))) {
335                         BREAKPOINT();
336                         return 0;
337                 }
338                 if (0 && nexthash < starthash) {
339                         /*
340                          * Unfortunately this useful invariant cannot be
341                          * reliably checked as parent node is not necessarily
342                          * locked.
343                          */
344                         n_print(leaf);
345                         CERROR("%#x < %#x\n", nexthash, starthash);
346                         dump_stack();
347                         return 0;
348                 }
349                 if (nexthash < hash) {
350                         BREAKPOINT();
351                         return 0;
352                 }
353                 hash = nexthash;
354         }
355         if (scan != end) {
356                 BREAKPOINT();
357                 return 0;
358         }
359         return 1;
360 }
361 /* LDISKFS_INVARIANT_ON */
362 #endif
363
364 /* LDISKFS_CORRECTNESS_ON */
365 #endif
366
367 static struct iam_ikey *lvar_ikey(const struct iam_leaf *l,
368                                   struct iam_ikey *key)
369 {
370         lvar_hash_t *hash;
371
372         assert_corr(n_at_rec(l));
373
374         hash = (void *)key;
375         *hash = e_hash(n_cur(l));
376         return key;
377 }
378
379 static struct iam_key *lvar_key(const struct iam_leaf *l)
380 {
381         return e_key(n_cur(l));
382 }
383
384 static int lvar_key_size(const struct iam_leaf *l)
385 {
386         return e_keysize(n_cur(l));
387 }
388
389 static void lvar_start(struct iam_leaf *l)
390 {
391         l->il_at = lvar_lentry(n_start(l));
392 }
393
394 static int lvar_init(struct iam_leaf *l)
395 {
396         int result;
397         int used;
398         struct lvar_leaf_header *head;
399
400         assert_corr(l->il_bh != NULL);
401
402         head = n_head(l);
403         used = h_used(head);
404         if (le16_to_cpu(head->vlh_magic) == IAM_LVAR_LEAF_MAGIC &&
405                         used <= blocksize(l)) {
406                 l->il_at = l->il_entries = lvar_lentry(n_start(l));
407                 result = 0;
408         } else {
409                 struct inode *obj;
410
411                 obj = iam_leaf_container(l)->ic_object;
412                 CERROR(
413                 "Bad magic in node %llu (#%lu): %#x != %#x or wrong used: %d\n",
414                 (unsigned long long)l->il_bh->b_blocknr, obj->i_ino,
415                 le16_to_cpu(head->vlh_magic), IAM_LVAR_LEAF_MAGIC,
416                 used);
417                 result = -EIO;
418         }
419         return result;
420 }
421
422 static void lvar_fini(struct iam_leaf *l)
423 {
424         l->il_entries = l->il_at = NULL;
425 }
426
427 static struct iam_rec *lvar_rec(const struct iam_leaf *l)
428 {
429         assert_corr(n_at_rec(l));
430         return e_rec(n_cur(l));
431 }
432
433 static void lvar_next(struct iam_leaf *l)
434 {
435         assert_corr(n_at_rec(l));
436         assert_corr(iam_leaf_is_locked(l));
437         l->il_at = lvar_lentry(e_next(l, n_cur(l)));
438 }
439
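/*
 * Position the leaf cursor for the NUL-terminated name @k.  Returns
 * IAM_LOOKUP_EXACT when an entry with that exact name is found,
 * IAM_LOOKUP_BEFORE when every entry hashes above @k (cursor left at the
 * first entry), and IAM_LOOKUP_OK otherwise, with the cursor at the first
 * entry of equal hash if one exists, else at the last entry of smaller hash.
 * IAM_LOOKUP_LAST is OR-ed in when the scan reached the end of the leaf.
 */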
440 static int lvar_lookup(struct iam_leaf *leaf, const struct iam_key *k)
441 {
442         struct lvar_leaf_entry *found;
443         struct lvar_leaf_entry *scan;
444         struct lvar_leaf_entry *end;
445         int result;
446         const char *name;
447         int namelen;
448         int found_equal;
449         lvar_hash_t hash;
450         int last;
451
452         assert_inv(n_invariant(leaf));
453         end = n_end(leaf);
454
455         name = kchar(k);
456         namelen = strlen(name);
457         hash = get_hash(iam_leaf_container(leaf), name, namelen);
458         found = NULL;
459         found_equal = 0;
460         last = 1;
461
462         for (scan = n_start(leaf); scan < end; scan = e_next(leaf, scan)) {
463                 lvar_hash_t scan_hash;
464
465                 scan_hash = e_hash(scan);
466                 if (scan_hash < hash)
467                         found = scan;
468                 else if (scan_hash == hash) {
469                         if (e_eq(scan, name, namelen)) {
470                                 /*
471                                  * perfect match
472                                  */
473                                 leaf->il_at = lvar_lentry(scan);
474                                 return IAM_LOOKUP_EXACT;
475                         } else if (!found_equal) {
476                                 found = scan;
477                                 found_equal = 1;
478                         }
479                 } else {
480                         last = 0;
481                         break;
482                 }
483         }
484         if (found == NULL) {
485                 /*
486                  * @k is less than all hashes in the leaf.
487                  */
488                 lvar_start(leaf);
489                 result = IAM_LOOKUP_BEFORE;
490         } else {
491                 leaf->il_at = lvar_lentry(found);
492                 result = IAM_LOOKUP_OK;
493                 assert_corr(n_at_rec(leaf));
494         }
495         if (last)
496                 result |= IAM_LOOKUP_LAST;
497         assert_inv(n_invariant(leaf));
498
499         return result;
500 }
501
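/*
 * Like lvar_lookup(), but keyed by the index key (the hash) rather than the
 * name: returns IAM_LOOKUP_EXACT with the cursor at the first entry whose
 * hash equals *@ik, IAM_LOOKUP_BEFORE when even the first entry hashes above
 * *@ik, and IAM_LOOKUP_OK otherwise with the cursor at the last entry whose
 * hash does not exceed *@ik.
 */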
502 static int lvar_ilookup(struct iam_leaf *leaf, const struct iam_ikey *ik)
503 {
504         struct lvar_leaf_entry *scan;
505         struct lvar_leaf_entry *end;
506         lvar_hash_t hash;
507
508         assert_inv(n_invariant(leaf));
509         end  = n_end(leaf);
510         hash = *(const lvar_hash_t *)ik;
511
512         lvar_start(leaf);
513         for (scan = n_start(leaf); scan < end; scan = e_next(leaf, scan)) {
514                 lvar_hash_t scan_hash;
515
516                 scan_hash = e_hash(scan);
517                 if (scan_hash > hash)
518                         return scan == n_start(leaf) ?
519                                 IAM_LOOKUP_BEFORE : IAM_LOOKUP_OK;
520                 leaf->il_at = lvar_lentry(scan);
521                 if (scan_hash == hash)
522                         return IAM_LOOKUP_EXACT;
523         }
524         assert_inv(n_invariant(leaf));
525         /*
526          * @ik is greater than any key in the node. Return last record in the
527          * node.
528          */
529         return IAM_LOOKUP_OK;
530 }
531
532 static void __lvar_key_set(struct iam_leaf *l, const struct iam_key *k)
533 {
534         memcpy(e_key(n_cur(l)), k, e_keysize(n_cur(l)));
535 }
536
537 static void lvar_key_set(struct iam_leaf *l, const struct iam_key *k)
538 {
539         assert_corr(n_at_rec(l));
540         assert_corr(strlen(kchar(k)) == e_keysize(n_cur(l)));
541         assert_corr(iam_leaf_is_locked(l));
542         __lvar_key_set(l, k);
543         assert_inv(n_invariant(l));
544 }
545
546 static int lvar_key_cmp(const struct iam_leaf *l, const struct iam_key *k)
547 {
548         lvar_hash_t hash;
549         const char *name;
550
551         name = kchar(k);
552
553         hash = get_hash(iam_leaf_container(l), name, strlen(name));
554         return e_cmp(l, n_cur(l), hash);
555 }
556
557 static int lvar_key_eq(const struct iam_leaf *l, const struct iam_key *k)
558 {
559         const char *name;
560
561         name = kchar(k);
562         return e_eq(n_cur(l), name, strlen(name));
563 }
564
565 static void __lvar_rec_set(struct iam_leaf *l, const struct iam_rec *r)
566 {
567         memcpy(e_rec(n_cur(l)), r, rec_size(r));
568 }
569
570 static void lvar_rec_set(struct iam_leaf *l, const struct iam_rec *r)
571 {
572         assert_corr(n_at_rec(l));
573         assert_corr(iam_leaf_is_locked(l));
574         __lvar_rec_set(l, r);
575         assert_inv(n_invariant(l));
576 }
577
578 static int lvar_rec_eq(const struct iam_leaf *l, const struct iam_rec *r)
579 {
580         struct iam_rec *rec = e_rec(n_cur(l));
581
582         if (rec_size(rec) != rec_size(r))
583                 return 0;
584         return !memcmp(rec, r, rec_size(r));
585 }
586
587 static void lvar_rec_get(const struct iam_leaf *l, struct iam_rec *r)
588 {
589         struct iam_rec *rec;
590
591         rec = e_rec(n_cur(l));
592         assert_corr(n_at_rec(l));
593         assert_corr(iam_leaf_is_locked(l));
594         memcpy(r, rec, rec_size(rec));
595         assert_inv(n_invariant(l));
596 }
597
598 static int lvar_can_add(const struct iam_leaf *l,
599                         const struct iam_key *k, const struct iam_rec *r)
600 {
601         assert_corr(iam_leaf_is_locked(l));
602         return h_used(n_head(l)) +
603                 getsize(l, strlen(kchar(k)), rec_size(r)) <= blocksize(l);
604 }
605
606 static int lvar_at_end(const struct iam_leaf *folio)
607 {
608         assert_corr(iam_leaf_is_locked(folio));
609         return n_cur(folio) == n_end(folio);
610 }
611
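/*
 * Insert (@k, @r) at the position established by a preceding lookup: entries
 * from the insertion point onwards are shifted towards the end of the block
 * by the padded size of the new entry, the header's used count is grown, and
 * the key, hash and record are written in place.  The caller must hold the
 * leaf lock and have checked lvar_can_add() beforehand.
 */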
612 static void lvar_rec_add(struct iam_leaf *leaf,
613                          const struct iam_key *k, const struct iam_rec *r)
614 {
615         const char *key;
616         int ksize;
617         int shift;
618         void *end;
619         void *start;
620         ptrdiff_t diff;
621
622         assert_corr(lvar_can_add(leaf, k, r));
623         assert_inv(n_invariant(leaf));
624         assert_corr(iam_leaf_is_locked(leaf));
625
626         key   = kchar(k);
627         ksize = strlen(key);
628         shift = getsize(leaf, ksize, rec_size(r));
629
630         if (!lvar_at_end(leaf)) {
631                 assert_corr(n_cur(leaf) < n_end(leaf));
632                 end = n_end(leaf);
633                 if (lvar_key_cmp(leaf, k) <= 0)
634                         lvar_next(leaf);
635                 else
636                         /*
 637                          * Another exceptional case: insertion with a key
 638                          * less than the least key in the leaf.
639                          */
640                         assert_corr(leaf->il_at == leaf->il_entries);
641
642                 start = leaf->il_at;
643                 diff  = PDIFF(end, start);
644                 assert_corr(diff >= 0);
645                 memmove(start + shift, start, diff);
646         }
647         h_used_adj(leaf, n_head(leaf), shift);
648         n_cur(leaf)->vle_keysize = cpu_to_le16(ksize);
649         n_cur(leaf)->vle_hash = cpu_to_le32(get_hash(iam_leaf_container(leaf),
650                                             key, ksize));
651         __lvar_key_set(leaf, k);
652         __lvar_rec_set(leaf, r);
653         assert_corr(n_at_rec(leaf));
654         assert_inv(n_invariant(leaf));
655 }
656
657 static void lvar_rec_del(struct iam_leaf *leaf, int shift)
658 {
659         void *next;
660         void *end;
661         int nob;
662
663         assert_corr(n_at_rec(leaf));
664         assert_inv(n_invariant(leaf));
665         assert_corr(iam_leaf_is_locked(leaf));
666
667         end  = n_end(leaf);
668         next = e_next(leaf, n_cur(leaf));
669         nob  = e_size(leaf, n_cur(leaf));
670         memmove(leaf->il_at, next, end - next);
671         h_used_adj(leaf, n_head(leaf), -nob);
672         assert_inv(n_invariant(leaf));
673 }
674
675 static void lvar_init_new(struct iam_container *c, struct buffer_head *bh)
676 {
677         struct lvar_leaf_header *hdr;
678
679         hdr = (struct lvar_leaf_header *)bh->b_data;
680         hdr->vlh_magic = cpu_to_le16(IAM_LVAR_LEAF_MAGIC);
 681         hdr->vlh_used  = cpu_to_le16(sizeof(*hdr));
682 }
683
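/*
 * Choose the split point of a full leaf: returns the first entry that starts
 * more than half a block past the first entry; everything from it onwards is
 * moved to the new leaf by lvar_split(), while *prev is set to the last
 * entry that stays behind.
 */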
684 static struct lvar_leaf_entry *find_pivot(const struct iam_leaf *leaf,
685                                           struct lvar_leaf_entry **prev)
686 {
687         void *scan;
688         void *start;
689         int threshold;
690
691         *prev = NULL;
692         threshold = blocksize(leaf) / 2;
693         for (scan = start = n_start(leaf); scan - start <= threshold;
694                         *prev = scan, scan = e_next(leaf, scan)) {
695                 ;
696         }
697         return scan;
698 }
699
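/*
 * Split a full leaf: entries from the pivot onwards are copied into the
 * freshly allocated block @new_blknr (whose buffer arrives in *bh), the used
 * counts of both leaves are adjusted, and the hash of the first moved entry,
 * with the low collision bit set when it equals the hash of the last entry
 * left behind, is inserted into the parent index node as the new delimiting
 * key.  When the current position migrates, *bh and leaf->il_bh are swapped
 * so that the cursor keeps pointing at the same record.
 */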
700 static void lvar_split(struct iam_leaf *leaf, struct buffer_head **bh,
701                        iam_ptr_t new_blknr)
702 {
703         struct lvar_leaf_entry *first_to_move;
704         struct lvar_leaf_entry *last_to_stay;
705         struct iam_path *path;
706         struct lvar_leaf_header *hdr;
707         struct buffer_head *new_leaf;
708         ptrdiff_t tomove;
709         lvar_hash_t hash;
710
711         assert_inv(n_invariant(leaf));
712         assert_corr(iam_leaf_is_locked(leaf));
713
714         new_leaf = *bh;
715         path = iam_leaf_path(leaf);
716
717         hdr = (void *)new_leaf->b_data;
718
719         first_to_move = find_pivot(leaf, &last_to_stay);
720         assert_corr(last_to_stay != NULL);
721         assert_corr(e_next(leaf, last_to_stay) == first_to_move);
722
723         hash = e_hash(first_to_move);
724         if (hash == e_hash(last_to_stay))
725                 /*
726                  * Duplicate hash.
727                  */
728                 hash |= 1;
729
730         tomove = PDIFF(n_end(leaf), first_to_move);
731         memmove(hdr + 1, first_to_move, tomove);
732
733         h_used_adj(leaf, hdr, tomove);
734         h_used_adj(leaf, n_head(leaf), -tomove);
735
736         assert_corr(n_end(leaf) == first_to_move);
737
738         if (n_cur(leaf) >= first_to_move) {
739                 /*
740                  * insertion point moves into new leaf.
741                  */
 742                 ptrdiff_t shift;
 743                 int result;
 744
 745                 shift = PDIFF(leaf->il_at, first_to_move);
 746                 *bh = leaf->il_bh;
 747                 leaf->il_bh = new_leaf;
 748                 leaf->il_curidx = new_blknr;
 749                 assert_corr(iam_leaf_is_locked(leaf));
 750                 result = lvar_init(leaf);
 751                 /*
 752                  * init cannot fail, as node was just initialized.
 753                  */
 754                 assert_corr(result == 0);
755                 leaf->il_at = ((void *)leaf->il_at) + shift;
756         }
757         /*
758          * Insert pointer to the new node (together with the least key in
759          * the node) into index node.
760          */
761         iam_insert_key_lock(path, path->ip_frame, (struct iam_ikey *)&hash,
762                             new_blknr);
763         assert_corr(n_cur(leaf) < n_end(leaf));
764         assert_inv(n_invariant(leaf));
765 }
766
767 static int lvar_leaf_empty(struct iam_leaf *leaf)
768 {
769         return h_used(n_head(leaf)) == sizeof(struct lvar_leaf_header);
770 }
771
772 static const struct iam_leaf_operations lvar_leaf_ops = {
773         .init           = lvar_init,
774         .init_new       = lvar_init_new,
775         .fini           = lvar_fini,
776         .start          = lvar_start,
777         .next           = lvar_next,
778         .key            = lvar_key,
779         .ikey           = lvar_ikey,
780         .rec            = lvar_rec,
781         .key_set        = lvar_key_set,
782         .key_cmp        = lvar_key_cmp,
783         .key_eq         = lvar_key_eq,
784         .key_size       = lvar_key_size,
785         .rec_set        = lvar_rec_set,
786         .rec_eq         = lvar_rec_eq,
787         .rec_get        = lvar_rec_get,
788         .lookup         = lvar_lookup,
789         .ilookup        = lvar_ilookup,
790         .at_end         = lvar_at_end,
791         .rec_add        = lvar_rec_add,
792         .rec_del        = lvar_rec_del,
793         .can_add        = lvar_can_add,
794         .split          = lvar_split,
795         .leaf_empty     = lvar_leaf_empty,
796 };
797
798 /*
799  * Index operations.
800  */
801
802 enum {
803         /* This is duplicated in lustre/utils/create_iam.c */
804         /* egrep -i '^o?x?[olabcdef]*$' /usr/share/dict/words */
805         IAM_LVAR_ROOT_MAGIC = 0xb01dface
806 };
807
808 /* This is duplicated in lustre/utils/create_iam.c */
809 struct lvar_root {
810         __le32 vr_magic;
811         __le16 vr_recsize;
812         __le16 vr_ptrsize;
813         u8 vr_indirect_levels;
814         u8 vr_padding0;
815         __le16 vr_padding1;
816 };
817
818 static u32 lvar_root_ptr(struct iam_container *c)
819 {
820         return 0;
821 }
822
823 static int lvar_node_init(struct iam_container *c, struct buffer_head *bh,
824                           int root)
825 {
826         return 0;
827 }
828
829 static struct iam_entry *lvar_root_inc(struct iam_container *c,
830                                        struct iam_path *path,
831                                        struct iam_frame *frame)
832 {
833         struct lvar_root *root;
834         struct iam_entry *entries;
835
836         assert_corr(iam_frame_is_locked(path, frame));
837         entries = frame->entries;
838
839         dx_set_count(entries, 2);
840         assert_corr(dx_get_limit(entries) == dx_root_limit(path));
841
842         root = (void *)frame->bh->b_data;
 843         assert_corr(le32_to_cpu(root->vr_magic) == IAM_LVAR_ROOT_MAGIC);
844         root->vr_indirect_levels++;
845         frame->at = entries = iam_entry_shift(path, entries, 1);
846         memset(iam_ikey_at(path, entries), 0,
847                iam_path_descr(path)->id_ikey_size);
848         return entries;
849 }
850
851 static int lvar_node_check(struct iam_path *path, struct iam_frame *frame)
852 {
853         unsigned int count;
854         unsigned int limit;
855         unsigned int limit_correct;
856         struct iam_entry *entries;
857
858         entries = dx_node_get_entries(path, frame);
859
860         if (frame == path->ip_frames) {
861                 struct lvar_root *root;
862
863                 root = (void *)frame->bh->b_data;
864                 if (le32_to_cpu(root->vr_magic) != IAM_LVAR_ROOT_MAGIC)
865                         return -EIO;
866                 limit_correct = dx_root_limit(path);
867         } else
868                 limit_correct = dx_node_limit(path);
869         count = dx_get_count(entries);
870         limit = dx_get_limit(entries);
871         if (count > limit)
872                 return -EIO;
873         if (limit != limit_correct)
874                 return -EIO;
875         return 0;
876 }
877
878 static int lvar_node_load(struct iam_path *path, struct iam_frame *frame)
879 {
880         struct iam_entry *entries;
881         void *data;
882
883         entries = dx_node_get_entries(path, frame);
884         data = frame->bh->b_data;
885
886         if (frame == path->ip_frames) {
887                 struct lvar_root *root;
888                 const char *name;
889
890                 root = data;
891                 name = kchar(path->ip_key_target);
892                 path->ip_indirect = root->vr_indirect_levels;
893                 if (path->ip_ikey_target == NULL) {
894                         path->ip_ikey_target = iam_path_ikey(path, 4);
895                         *(lvar_hash_t *)path->ip_ikey_target =
896                                 get_hash(path->ip_container, name,
897                                          strlen(name));
898                 }
899         }
900         frame->entries = frame->at = entries;
901         return 0;
902 }
903
904 static int lvar_ikeycmp(const struct iam_container *c,
905                         const struct iam_ikey *k1, const struct iam_ikey *k2)
906 {
907         lvar_hash_t p1 = le32_to_cpu(*(lvar_hash_t *)k1);
908         lvar_hash_t p2 = le32_to_cpu(*(lvar_hash_t *)k2);
909
910         return p1 > p2 ? 1 : (p1 < p2 ? -1 : 0);
911 }
912
913 static struct iam_path_descr *lvar_ipd_alloc(const struct iam_container *c,
914                                              void *area)
915 {
916         return iam_ipd_alloc(area, c->ic_descr->id_ikey_size);
917 }
918
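/*
 * Lay out a fresh root block: a struct lvar_root, then a dx_countlimit with
 * count 2 (the limit slot itself plus one pointer), a zeroed u32 reserved
 * for the "idle_blocks" pointer, and finally the single index entry
 * <hash 0, pointer to block 1>, i.e. the least possible key pointing at the
 * only leaf.
 */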
919 static void lvar_root(void *buf,
920                       int blocksize, int keysize, int ptrsize, int recsize)
921 {
922         struct lvar_root *root;
923         struct dx_countlimit *limit;
924         void *entry;
925         int isize;
926
927         isize = sizeof(lvar_hash_t) + ptrsize;
928         root = buf;
929         *root = (typeof(*root)) {
930                 .vr_magic            = cpu_to_le32(IAM_LVAR_ROOT_MAGIC),
931                 .vr_recsize          = cpu_to_le16(recsize),
932                 .vr_ptrsize          = cpu_to_le16(ptrsize),
933                 .vr_indirect_levels  = 0
934         };
935
936         limit = (void *)(root + 1);
937         *limit = (typeof(*limit)){
938                 /*
939                  * limit itself + one pointer to the leaf.
940                  */
941                 .count = cpu_to_le16(2),
942                 .limit = iam_root_limit(sizeof(struct lvar_root), blocksize,
943                                         sizeof(lvar_hash_t) + ptrsize)
944         };
945
946         /* To guarantee that the padding "keysize + ptrsize"
947          * covers the "dx_countlimit" and the "idle_blocks". */
948         LASSERT((keysize + ptrsize) >=
949                 (sizeof(struct dx_countlimit) + sizeof(u32)));
950
951         entry = (void *)(limit + 1);
 952         /* Put "idle_blocks" just after the limit. There was padding after
 953          * the limit; "idle_blocks" re-uses part of it, so there are no
 954          * compatibility issues with the old layout.
955          */
956         *(u32 *)entry = 0;
957
958         /*
959          * Skip over @limit.
960          */
961         entry = (void *)(root + 1) + isize;
962
963         /*
964          * Entry format is <key> followed by <ptr>. In the minimal tree
965          * consisting of a root and single node, <key> is a minimal possible
966          * key.
967          */
968         *(lvar_hash_t *)entry = 0;
969         entry += sizeof(lvar_hash_t);
970         /* now @entry points to <ptr> */
971         if (ptrsize == 4)
972                 *(u_int32_t *)entry = cpu_to_le32(1);
973         else
974                 *(u_int64_t *)entry = cpu_to_le64(1);
975 }
976
977 static int lvar_esize(int namelen, int recsize)
978 {
979         return (offsetof(struct lvar_leaf_entry, vle_key) +
980                         namelen + recsize + LVAR_ROUND) & ~LVAR_ROUND;
981 }
982
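/*
 * Lay out a fresh leaf block: the header's used count already accounts for
 * one sentinel entry with an empty key, hash 0 and a zeroed record whose
 * first byte stores @recsize, so that rec_size() works on it from the start.
 */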
983 static void lvar_leaf(void *buf,
984                       int blocksize, int keysize, int ptrsize, int recsize)
985 {
986         struct lvar_leaf_header *head;
987         struct lvar_leaf_entry *entry;
988
989         /* form leaf */
990         head = buf;
991         *head = (typeof(*head)) {
992                 .vlh_magic = cpu_to_le16(IAM_LVAR_LEAF_MAGIC),
993                 .vlh_used  = cpu_to_le16(sizeof(*head) + lvar_esize(0, recsize))
994         };
995         entry = (void *)(head + 1);
996         *entry = (typeof(*entry)) {
997                 .vle_hash    = 0,
998                 .vle_keysize = 0
999         };
1000         memset(e_rec(entry), 0, recsize);
1001         *(char *)e_rec(entry) = recsize;
1002 }
1003
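/*
 * Create an empty lvar container in @obj under transaction @handle: the
 * first appended block (block 0) becomes the root initialized by
 * lvar_root(), the second (block 1) becomes the initial leaf initialized by
 * lvar_leaf(), and both are marked dirty.
 */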
1004 int iam_lvar_create(struct inode *obj,
1005                     int keysize, int ptrsize, int recsize, handle_t *handle)
1006 {
1007         struct buffer_head *root_node;
1008         struct buffer_head *leaf_node;
1009         struct super_block *sb;
1010
1011         u32 blknr;
1012         int result = 0;
1013         unsigned long bsize;
1014
1015         assert_corr(obj->i_size == 0);
1016
1017         sb = obj->i_sb;
1018         bsize = sb->s_blocksize;
1019         root_node = osd_ldiskfs_append(handle, obj, &blknr);
1020         if (IS_ERR(root_node))
1021                 GOTO(out, result = PTR_ERR(root_node));
1022
1023         leaf_node = osd_ldiskfs_append(handle, obj, &blknr);
1024         if (IS_ERR(leaf_node))
1025                 GOTO(out_root, result = PTR_ERR(leaf_node));
1026
1027         lvar_root(root_node->b_data, bsize, keysize, ptrsize, recsize);
1028         lvar_leaf(leaf_node->b_data, bsize, keysize, ptrsize, recsize);
1029         ldiskfs_mark_inode_dirty(handle, obj);
1030         result = ldiskfs_handle_dirty_metadata(handle, NULL, root_node);
1031         if (result == 0)
1032                 result = ldiskfs_handle_dirty_metadata(handle, NULL, leaf_node);
1033         if (result != 0)
1034                 ldiskfs_std_error(sb, result);
1035
1036         brelse(leaf_node);
1037
1038         GOTO(out_root, result);
1039
1040 out_root:
1041         brelse(root_node);
1042 out:
1043         return result;
1044 }
1045
1046 static const struct iam_operations lvar_ops = {
1047         .id_root_ptr    = lvar_root_ptr,
1048         .id_node_read   = iam_node_read,
1049         .id_node_init   = lvar_node_init,
1050         .id_node_check  = lvar_node_check,
1051         .id_node_load   = lvar_node_load,
1052         .id_ikeycmp     = lvar_ikeycmp,
1053         .id_root_inc    = lvar_root_inc,
1054         .id_ipd_alloc   = lvar_ipd_alloc,
1055         .id_ipd_free    = iam_ipd_free,
1056         .id_name        = "lvar"
1057 };
1058
1059 int iam_lvar_guess(struct iam_container *c)
1060 {
1061         int result;
1062         struct buffer_head *bh;
1063         const struct lvar_root *root;
1064
1065         assert_corr(c->ic_object != NULL);
1066
1067         result = iam_node_read(c, lvar_root_ptr(c), NULL, &bh);
1068         if (result == 0) {
1069                 root = (void *)bh->b_data;
1070
1071                 if (le32_to_cpu(root->vr_magic) == IAM_LVAR_ROOT_MAGIC) {
1072                         struct iam_descr *descr;
1073
1074                         descr = c->ic_descr;
1075                         descr->id_key_size  = LDISKFS_NAME_LEN;
1076                         descr->id_ikey_size = sizeof(lvar_hash_t);
1077                         descr->id_rec_size  = le16_to_cpu(root->vr_recsize);
1078                         descr->id_ptr_size  = le16_to_cpu(root->vr_ptrsize);
1079                         descr->id_root_gap  = sizeof(*root);
1080                         descr->id_node_gap  = 0;
1081                         descr->id_ops       = &lvar_ops;
1082                         descr->id_leaf_ops  = &lvar_leaf_ops;
1083                         c->ic_root_bh = bh;
1084                 } else {
1085                         result = -EBADF;
1086                         brelse(bh);
1087                 }
1088         }
1089         return result;
1090 }