/*
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License version 2 for more details. A copy is
 * included in the COPYING file that accompanied this code.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * Copyright (c) 2014, 2016, Intel Corporation.
 *
 * lustre/include/lustre_lmv.h
 *
 * Lustre LMV structures and functions.
 *
 * Author: Di Wang <di.wang@intel.com>
 */
35 #include <uapi/linux/lustre/lustre_idl.h>
38 struct lu_fid lmo_fid;
40 struct inode *lmo_root;
43 struct lmv_stripe_md {
45 __u32 lsm_md_stripe_count;
46 __u32 lsm_md_master_mdt_index;
47 __u32 lsm_md_hash_type;
48 __u8 lsm_md_max_inherit;
49 __u8 lsm_md_max_inherit_rr;
50 __u32 lsm_md_layout_version;
51 __u32 lsm_md_migrate_offset;
52 __u32 lsm_md_migrate_hash;
53 char lsm_md_pool_name[LOV_MAXPOOLNAME + 1];
54 struct lmv_oinfo lsm_md_oinfo[0];
57 struct lmv_stripe_object {
60 struct lmv_stripe_md lso_lsm;
61 struct lmv_foreign_md lso_lfm;
65 static inline bool lmv_dir_striped(const struct lmv_stripe_object *lso)
67 return lso && lso->lso_lsm.lsm_md_magic == LMV_MAGIC;
70 static inline bool lmv_dir_foreign(const struct lmv_stripe_object *lso)
72 return lso && lso->lso_lsm.lsm_md_magic == LMV_MAGIC_FOREIGN;
75 static inline bool lmv_dir_layout_changing(const struct lmv_stripe_object *lso)
77 return lmv_dir_striped(lso) &&
78 lmv_hash_is_layout_changing(lso->lso_lsm.lsm_md_hash_type);
81 static inline bool lmv_dir_bad_hash(const struct lmv_stripe_object *lso)
83 if (!lmv_dir_striped(lso))
86 if (lso->lso_lsm.lsm_md_hash_type & LMV_HASH_FLAG_BAD_TYPE)
89 return !lmv_is_known_hash_type(lso->lso_lsm.lsm_md_hash_type);
92 static inline __u8 lmv_inherit_next(__u8 inherit)
94 if (inherit == LMV_INHERIT_END || inherit == LMV_INHERIT_NONE)
95 return LMV_INHERIT_NONE;
97 if (inherit == LMV_INHERIT_UNLIMITED || inherit > LMV_INHERIT_MAX)
103 static inline __u8 lmv_inherit_rr_next(__u8 inherit_rr)
105 if (inherit_rr == LMV_INHERIT_RR_NONE ||
106 inherit_rr == LMV_INHERIT_RR_UNLIMITED ||
107 inherit_rr > LMV_INHERIT_RR_MAX)
110 return inherit_rr - 1;
113 static inline bool lmv_is_inheritable(__u8 inherit)
115 return inherit == LMV_INHERIT_UNLIMITED ||
116 (inherit > LMV_INHERIT_END && inherit <= LMV_INHERIT_MAX);
119 static inline bool lsm_md_eq(const struct lmv_stripe_object *lso1,
120 const struct lmv_stripe_object *lso2)
122 const struct lmv_stripe_md *lsm1 = &lso1->lso_lsm;
123 const struct lmv_stripe_md *lsm2 = &lso2->lso_lsm;
126 if (lsm1->lsm_md_magic != lsm2->lsm_md_magic ||
127 lsm1->lsm_md_stripe_count != lsm2->lsm_md_stripe_count ||
128 lsm1->lsm_md_master_mdt_index !=
129 lsm2->lsm_md_master_mdt_index ||
130 lsm1->lsm_md_hash_type != lsm2->lsm_md_hash_type ||
131 lsm1->lsm_md_max_inherit != lsm2->lsm_md_max_inherit ||
132 lsm1->lsm_md_max_inherit_rr != lsm2->lsm_md_max_inherit_rr ||
133 lsm1->lsm_md_layout_version !=
134 lsm2->lsm_md_layout_version ||
135 lsm1->lsm_md_migrate_offset !=
136 lsm2->lsm_md_migrate_offset ||
137 lsm1->lsm_md_migrate_hash !=
138 lsm2->lsm_md_migrate_hash ||
139 strncmp(lsm1->lsm_md_pool_name, lsm2->lsm_md_pool_name,
140 sizeof(lsm1->lsm_md_pool_name)) != 0)
143 if (lmv_dir_striped(lso1)) {
144 for (idx = 0; idx < lsm1->lsm_md_stripe_count; idx++) {
145 if (!lu_fid_eq(&lsm1->lsm_md_oinfo[idx].lmo_fid,
146 &lsm2->lsm_md_oinfo[idx].lmo_fid))
149 } else if (lsm1->lsm_md_magic == LMV_USER_MAGIC_SPECIFIC) {
150 for (idx = 0; idx < lsm1->lsm_md_stripe_count; idx++) {
151 if (lsm1->lsm_md_oinfo[idx].lmo_mds !=
152 lsm2->lsm_md_oinfo[idx].lmo_mds)
161 lmv_stripe_object_dump(int mask, const struct lmv_stripe_object *lsmo)
163 const struct lmv_stripe_md *lsm = &lsmo->lso_lsm;
167 "dump LMV: refs %u magic=%#x count=%u index=%u hash=%s:%#x max_inherit=%hhu max_inherit_rr=%hhu version=%u migrate_offset=%u migrate_hash=%s:%x pool=%.*s\n",
168 lsm->lsm_md_magic, atomic_read(&lsmo->lso_refs),
169 lsm->lsm_md_stripe_count, lsm->lsm_md_master_mdt_index,
170 lmv_is_known_hash_type(lsm->lsm_md_hash_type) ?
171 mdt_hash_name[lsm->lsm_md_hash_type & LMV_HASH_TYPE_MASK] :
172 "invalid", lsm->lsm_md_hash_type,
173 lsm->lsm_md_max_inherit, lsm->lsm_md_max_inherit_rr,
174 lsm->lsm_md_layout_version, lsm->lsm_md_migrate_offset,
175 lmv_is_known_hash_type(lsm->lsm_md_migrate_hash) ?
176 mdt_hash_name[lsm->lsm_md_migrate_hash & LMV_HASH_TYPE_MASK] :
177 "invalid", lsm->lsm_md_migrate_hash,
178 LOV_MAXPOOLNAME, lsm->lsm_md_pool_name);
180 if (!lmv_dir_striped(lsmo))
183 for (i = 0; i < lsm->lsm_md_stripe_count; i++)
184 CDEBUG_LIMIT(mask, "stripe[%d] "DFID"\n",
185 i, PFID(&lsm->lsm_md_oinfo[i].lmo_fid));
189 lmv_object_inherited(const struct lmv_stripe_object *plsm,
190 const struct lmv_stripe_object *clsm)
192 return plsm && clsm &&
193 plsm->lso_lsm.lsm_md_magic ==
194 clsm->lso_lsm.lsm_md_magic &&
195 plsm->lso_lsm.lsm_md_stripe_count ==
196 clsm->lso_lsm.lsm_md_stripe_count &&
197 plsm->lso_lsm.lsm_md_master_mdt_index ==
198 clsm->lso_lsm.lsm_md_master_mdt_index &&
199 plsm->lso_lsm.lsm_md_hash_type ==
200 clsm->lso_lsm.lsm_md_hash_type &&
201 lmv_inherit_next(plsm->lso_lsm.lsm_md_max_inherit) ==
202 clsm->lso_lsm.lsm_md_max_inherit &&
203 lmv_inherit_rr_next(plsm->lso_lsm.lsm_md_max_inherit_rr) ==
204 clsm->lso_lsm.lsm_md_max_inherit_rr;
209 struct lmv_stripe_object *lmv_stripe_object_alloc(__u32 magic,
210 const union lmv_mds_md *lmm,
213 void lmv_stripe_object_put(struct lmv_stripe_object **lsm_obj);
215 struct lmv_stripe_object *
216 lmv_stripe_object_get(struct lmv_stripe_object *lsm_obj);
218 static inline void lmv1_le_to_cpu(struct lmv_mds_md_v1 *lmv_dst,
219 const struct lmv_mds_md_v1 *lmv_src)
223 lmv_dst->lmv_magic = le32_to_cpu(lmv_src->lmv_magic);
224 lmv_dst->lmv_stripe_count = le32_to_cpu(lmv_src->lmv_stripe_count);
225 lmv_dst->lmv_master_mdt_index =
226 le32_to_cpu(lmv_src->lmv_master_mdt_index);
227 lmv_dst->lmv_hash_type = le32_to_cpu(lmv_src->lmv_hash_type);
228 lmv_dst->lmv_layout_version = le32_to_cpu(lmv_src->lmv_layout_version);
229 if (lmv_src->lmv_stripe_count > LMV_MAX_STRIPE_COUNT)
231 for (i = 0; i < lmv_src->lmv_stripe_count; i++)
232 fid_le_to_cpu(&lmv_dst->lmv_stripe_fids[i],
233 &lmv_src->lmv_stripe_fids[i]);
236 static inline void lmv_le_to_cpu(union lmv_mds_md *lmv_dst,
237 const union lmv_mds_md *lmv_src)
239 switch (le32_to_cpu(lmv_src->lmv_magic)) {
241 lmv1_le_to_cpu(&lmv_dst->lmv_md_v1, &lmv_src->lmv_md_v1);
248 /* This hash is only for testing purpose */
249 static inline unsigned int
250 lmv_hash_all_chars(unsigned int count, const char *name, int namelen)
253 const unsigned char *p = (const unsigned char *)name;
255 while (--namelen >= 0)
263 static inline unsigned int
264 lmv_hash_fnv1a(unsigned int count, const char *name, int namelen)
268 hash = lustre_hash_fnv_1a_64(name, namelen);
270 return do_div(hash, count);
274 * Robert Jenkins' function for mixing 32-bit values
275 * http://burtleburtle.net/bob/hash/evahash.html
276 * a, b = random bits, c = input and output
278 * Mixing inputs to generate an evenly distributed hash.
280 #define crush_hashmix(a, b, c) \
282 a = a - b; a = a - c; a = a ^ (c >> 13); \
283 b = b - c; b = b - a; b = b ^ (a << 8); \
284 c = c - a; c = c - b; c = c ^ (b >> 13); \
285 a = a - b; a = a - c; a = a ^ (c >> 12); \
286 b = b - c; b = b - a; b = b ^ (a << 16); \
287 c = c - a; c = c - b; c = c ^ (b >> 5); \
288 a = a - b; a = a - c; a = a ^ (c >> 3); \
289 b = b - c; b = b - a; b = b ^ (a << 10); \
290 c = c - a; c = c - b; c = c ^ (b >> 15); \
293 #define crush_hash_seed 1315423911
295 static inline __u32 crush_hash(__u32 a, __u32 b)
297 __u32 hash = crush_hash_seed ^ a ^ b;
301 crush_hashmix(a, b, hash);
302 crush_hashmix(x, a, hash);
303 crush_hashmix(b, y, hash);
308 /* refer to https://github.com/ceph/ceph/blob/master/src/crush/hash.c and
309 * https://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf for details of CRUSH
312 static inline unsigned int
313 lmv_hash_crush(unsigned int count, const char *name, int namelen, bool crush2)
315 unsigned long long straw;
316 unsigned long long highest_straw = 0;
318 unsigned int idx = 0;
321 /* put temp and backup file on the same MDT where target is located.
322 * temporary file naming rule:
323 * 1. rsync: .<target>.XXXXXX
324 * 2. dstripe: <target>.XXXXXXXX
326 if (lu_name_is_temp_file(name, namelen, true, 6, crush2)) {
329 } else if (lu_name_is_temp_file(name, namelen, false, 8, crush2)) {
331 } else if (lu_name_is_backup_file(name, namelen, &i)) {
332 LASSERT(i < namelen);
336 pg_id = lmv_hash_fnv1a(LMV_CRUSH_PG_COUNT, name, namelen);
338 /* distribute PG among all stripes pseudo-randomly, so they are almost
339 * evenly distributed, and when stripe count changes, only (delta /
340 * total) sub files need to be moved, herein 'delta' is added or removed
341 * stripe count, 'total' is total stripe count before change for
342 * removal, or count after change for addition.
344 for (i = 0; i < count; i++) {
345 straw = crush_hash(pg_id, i);
346 if (straw > highest_straw) {
347 highest_straw = straw;
351 LASSERT(idx < count);
356 /* directory layout may change in three ways:
357 * 1. directory migration, in its LMV source stripes are appended after
358 * target stripes, \a migrate_hash is source hash type, \a migrate_offset is
359 * target stripe count,
360 * 2. directory split, \a migrate_hash is hash type before split,
361 * \a migrate_offset is stripe count before split.
362 * 3. directory merge, \a migrate_hash is hash type after merge,
363 * \a migrate_offset is stripe count after merge.
366 __lmv_name_to_stripe_index(__u32 hash_type, __u32 stripe_count,
367 __u32 migrate_hash, __u32 migrate_offset,
368 const char *name, int namelen, bool new_layout)
370 __u32 saved_hash = hash_type;
371 __u32 saved_count = stripe_count;
372 int stripe_index = 0;
374 LASSERT(namelen > 0);
375 LASSERT(stripe_count > 0);
377 if (lmv_hash_is_splitting(hash_type)) {
379 hash_type = migrate_hash;
380 stripe_count = migrate_offset;
382 } else if (lmv_hash_is_merging(hash_type)) {
384 hash_type = migrate_hash;
385 stripe_count = migrate_offset;
387 } else if (lmv_hash_is_migrating(hash_type)) {
389 stripe_count = migrate_offset;
391 hash_type = migrate_hash;
392 stripe_count -= migrate_offset;
396 if (stripe_count > 1) {
397 switch (hash_type & LMV_HASH_TYPE_MASK) {
398 case LMV_HASH_TYPE_ALL_CHARS:
399 stripe_index = lmv_hash_all_chars(stripe_count, name,
402 case LMV_HASH_TYPE_FNV_1A_64:
403 stripe_index = lmv_hash_fnv1a(stripe_count, name,
406 case LMV_HASH_TYPE_CRUSH:
407 stripe_index = lmv_hash_crush(stripe_count, name,
410 case LMV_HASH_TYPE_CRUSH2:
411 stripe_index = lmv_hash_crush(stripe_count, name,
419 LASSERT(stripe_index < stripe_count);
421 if (!new_layout && lmv_hash_is_migrating(saved_hash))
422 stripe_index += migrate_offset;
424 LASSERT(stripe_index < saved_count);
426 CDEBUG(D_INFO, "name %.*s hash=%#x/%#x idx=%d/%u/%u under %s layout\n",
427 namelen, name, saved_hash, migrate_hash, stripe_index,
428 saved_count, migrate_offset, new_layout ? "new" : "old");
433 static inline int lmv_name_to_stripe_index(struct lmv_mds_md_v1 *lmv,
434 const char *name, int namelen)
436 if (lmv->lmv_magic == LMV_MAGIC_V1 ||
437 lmv->lmv_magic == LMV_MAGIC_STRIPE)
438 return __lmv_name_to_stripe_index(lmv->lmv_hash_type,
439 lmv->lmv_stripe_count,
440 lmv->lmv_migrate_hash,
441 lmv->lmv_migrate_offset,
442 name, namelen, true);
444 if (lmv->lmv_magic == cpu_to_le32(LMV_MAGIC_V1) ||
445 lmv->lmv_magic == cpu_to_le32(LMV_MAGIC_STRIPE))
446 return __lmv_name_to_stripe_index(
447 le32_to_cpu(lmv->lmv_hash_type),
448 le32_to_cpu(lmv->lmv_stripe_count),
449 le32_to_cpu(lmv->lmv_migrate_hash),
450 le32_to_cpu(lmv->lmv_migrate_offset),
451 name, namelen, true);
456 static inline int lmv_name_to_stripe_index_old(struct lmv_mds_md_v1 *lmv,
457 const char *name, int namelen)
459 if (lmv->lmv_magic == LMV_MAGIC_V1 ||
460 lmv->lmv_magic == LMV_MAGIC_STRIPE)
461 return __lmv_name_to_stripe_index(lmv->lmv_hash_type,
462 lmv->lmv_stripe_count,
463 lmv->lmv_migrate_hash,
464 lmv->lmv_migrate_offset,
465 name, namelen, false);
467 if (lmv->lmv_magic == cpu_to_le32(LMV_MAGIC_V1) ||
468 lmv->lmv_magic == cpu_to_le32(LMV_MAGIC_STRIPE))
469 return __lmv_name_to_stripe_index(
470 le32_to_cpu(lmv->lmv_hash_type),
471 le32_to_cpu(lmv->lmv_stripe_count),
472 le32_to_cpu(lmv->lmv_migrate_hash),
473 le32_to_cpu(lmv->lmv_migrate_offset),
474 name, namelen, false);
479 static inline bool lmv_user_magic_supported(__u32 lum_magic)
481 return lum_magic == LMV_USER_MAGIC ||
482 lum_magic == LMV_USER_MAGIC_SPECIFIC ||
483 lum_magic == LMV_MAGIC_FOREIGN;
486 #define LMV_DEBUG(mask, lmv, msg) \
488 "%s LMV: magic=%#x count=%u index=%u hash=%s:%#x version=%u migrate_offset=%u migrate_hash=%s:%x pool=%.*s\n",\
489 msg, (lmv)->lmv_magic, (lmv)->lmv_stripe_count, \
490 (lmv)->lmv_master_mdt_index, \
491 lmv_is_known_hash_type((lmv)->lmv_hash_type) ? \
492 mdt_hash_name[(lmv)->lmv_hash_type & LMV_HASH_TYPE_MASK] : \
493 "invalid", (lmv)->lmv_hash_type, \
494 (lmv)->lmv_layout_version, (lmv)->lmv_migrate_offset, \
495 lmv_is_known_hash_type((lmv)->lmv_migrate_hash) ? \
496 mdt_hash_name[(lmv)->lmv_migrate_hash & LMV_HASH_TYPE_MASK] : \
497 "invalid", (lmv)->lmv_migrate_hash, \
498 LOV_MAXPOOLNAME, lmv->lmv_pool_name)
500 /* master LMV is sane */
501 static inline bool lmv_is_sane(const struct lmv_mds_md_v1 *lmv)
506 if (le32_to_cpu(lmv->lmv_magic) != LMV_MAGIC_V1)
509 if (le32_to_cpu(lmv->lmv_stripe_count) == 0)
512 if (!lmv_is_sane_hash_type(le32_to_cpu(lmv->lmv_hash_type)))
517 LMV_DEBUG(D_ERROR, lmv, "unknown layout");
521 /* LMV can be either master or stripe LMV */
522 static inline bool lmv_is_sane2(const struct lmv_mds_md_v1 *lmv)
527 if (le32_to_cpu(lmv->lmv_magic) != LMV_MAGIC_V1 &&
528 le32_to_cpu(lmv->lmv_magic) != LMV_MAGIC_STRIPE)
531 if (le32_to_cpu(lmv->lmv_stripe_count) == 0)
534 if (!lmv_is_sane_hash_type(le32_to_cpu(lmv->lmv_hash_type)))
539 LMV_DEBUG(D_ERROR, lmv, "unknown layout");
543 static inline bool lmv_is_splitting(const struct lmv_mds_md_v1 *lmv)
545 if (!lmv_is_sane2(lmv))
548 return lmv_hash_is_splitting(cpu_to_le32(lmv->lmv_hash_type));
551 static inline bool lmv_is_merging(const struct lmv_mds_md_v1 *lmv)
553 if (!lmv_is_sane2(lmv))
556 return lmv_hash_is_merging(cpu_to_le32(lmv->lmv_hash_type));
559 static inline bool lmv_is_migrating(const struct lmv_mds_md_v1 *lmv)
561 if (!lmv_is_sane(lmv))
564 return lmv_hash_is_migrating(cpu_to_le32(lmv->lmv_hash_type));
567 static inline bool lmv_is_restriping(const struct lmv_mds_md_v1 *lmv)
569 if (!lmv_is_sane2(lmv))
572 return lmv_hash_is_splitting(cpu_to_le32(lmv->lmv_hash_type)) ||
573 lmv_hash_is_merging(cpu_to_le32(lmv->lmv_hash_type));
576 static inline bool lmv_is_layout_changing(const struct lmv_mds_md_v1 *lmv)
578 if (!lmv_is_sane2(lmv))
581 return lmv_hash_is_splitting(cpu_to_le32(lmv->lmv_hash_type)) ||
582 lmv_hash_is_merging(cpu_to_le32(lmv->lmv_hash_type)) ||
583 lmv_hash_is_migrating(cpu_to_le32(lmv->lmv_hash_type));
586 static inline bool lmv_is_fixed(const struct lmv_mds_md_v1 *lmv)
588 return cpu_to_le32(lmv->lmv_hash_type) & LMV_HASH_FLAG_FIXED;