X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Finclude%2Flustre_lmv.h;h=aafaed3e2edc81ff5f4950a8685e981e3ab7345c;hb=1ac4b9598ad6e2f94c4c672b4733186364255c6a;hp=ce0bfa641f28c8d50ae3d0af8eca0d673df1eece;hpb=5f3e926ac9ff8ad134ad920d0e8545e16395ef3b;p=fs%2Flustre-release.git

diff --git a/lustre/include/lustre_lmv.h b/lustre/include/lustre_lmv.h
index ce0bfa6..aafaed3 100644
--- a/lustre/include/lustre_lmv.h
+++ b/lustre/include/lustre_lmv.h
@@ -20,7 +20,7 @@
  * GPL HEADER END
  */
 /*
- * Copyright (c) 2013, Intel Corporation.
+ * Copyright (c) 2014, 2016, Intel Corporation.
  */
 /*
  * lustre/include/lustre_lmv.h
@@ -32,11 +32,11 @@
 #ifndef _LUSTRE_LMV_H
 #define _LUSTRE_LMV_H
 
-#include <lustre/lustre_idl.h>
+#include <uapi/linux/lustre/lustre_idl.h>
 
 struct lmv_oinfo {
 	struct lu_fid	lmo_fid;
-	mdsno_t		lmo_mds;
+	u32		lmo_mds;
 	struct inode	*lmo_root;
 };
 
@@ -45,42 +45,117 @@ struct lmv_stripe_md {
 	__u32	lsm_md_stripe_count;
 	__u32	lsm_md_master_mdt_index;
 	__u32	lsm_md_hash_type;
+	__u8	lsm_md_max_inherit;
+	__u8	lsm_md_max_inherit_rr;
 	__u32	lsm_md_layout_version;
+	__u32	lsm_md_migrate_offset;
+	__u32	lsm_md_migrate_hash;
 	__u32	lsm_md_default_count;
 	__u32	lsm_md_default_index;
-	char	lsm_md_pool_name[LOV_MAXPOOLNAME];
+	char	lsm_md_pool_name[LOV_MAXPOOLNAME + 1];
 	struct lmv_oinfo lsm_md_oinfo[0];
 };
 
-union lmv_mds_md;
+static inline bool lmv_dir_striped(const struct lmv_stripe_md *lsm)
+{
+	return lsm && lsm->lsm_md_magic == LMV_MAGIC;
+}
 
-int lmv_pack_md(union lmv_mds_md **lmmp, const struct lmv_stripe_md *lsm,
-		int stripe_count);
-int lmv_alloc_md(union lmv_mds_md **lmmp, int stripe_count);
-void lmv_free_md(union lmv_mds_md *lmm);
-int lmv_alloc_memmd(struct lmv_stripe_md **lsmp, int stripe_count);
-void lmv_free_memmd(struct lmv_stripe_md *lsm);
+static inline bool lmv_dir_foreign(const struct lmv_stripe_md *lsm)
+{
+	return lsm && lsm->lsm_md_magic == LMV_MAGIC_FOREIGN;
+}
 
-static inline void lmv1_cpu_to_le(struct lmv_mds_md_v1 *lmv_dst,
-				  const struct lmv_mds_md_v1 *lmv_src)
+static inline bool lmv_dir_layout_changing(const struct lmv_stripe_md *lsm)
+{
+	return lmv_dir_striped(lsm) &&
+	       lmv_hash_is_layout_changing(lsm->lsm_md_hash_type);
+}
+
+static inline bool lmv_dir_bad_hash(const struct lmv_stripe_md *lsm)
+{
+	if (!lmv_dir_striped(lsm))
+		return false;
+
+	if (lsm->lsm_md_hash_type & LMV_HASH_FLAG_BAD_TYPE)
+		return true;
+
+	return !lmv_is_known_hash_type(lsm->lsm_md_hash_type);
+}
+
+static inline bool
+lsm_md_eq(const struct lmv_stripe_md *lsm1, const struct lmv_stripe_md *lsm2)
+{
+	__u32 idx;
+
+	if (lsm1->lsm_md_magic != lsm2->lsm_md_magic ||
+	    lsm1->lsm_md_stripe_count != lsm2->lsm_md_stripe_count ||
+	    lsm1->lsm_md_master_mdt_index !=
+				lsm2->lsm_md_master_mdt_index ||
+	    lsm1->lsm_md_hash_type != lsm2->lsm_md_hash_type ||
+	    lsm1->lsm_md_max_inherit != lsm2->lsm_md_max_inherit ||
+	    lsm1->lsm_md_max_inherit_rr != lsm2->lsm_md_max_inherit_rr ||
+	    lsm1->lsm_md_layout_version !=
+				lsm2->lsm_md_layout_version ||
+	    lsm1->lsm_md_migrate_offset !=
+				lsm2->lsm_md_migrate_offset ||
+	    lsm1->lsm_md_migrate_hash !=
+				lsm2->lsm_md_migrate_hash ||
+	    strncmp(lsm1->lsm_md_pool_name, lsm2->lsm_md_pool_name,
+		    sizeof(lsm1->lsm_md_pool_name)) != 0)
+		return false;
+
+	if (lmv_dir_striped(lsm1)) {
+		for (idx = 0; idx < lsm1->lsm_md_stripe_count; idx++) {
+			if (!lu_fid_eq(&lsm1->lsm_md_oinfo[idx].lmo_fid,
+				       &lsm2->lsm_md_oinfo[idx].lmo_fid))
+				return false;
+		}
+	} else if (lsm1->lsm_md_magic == LMV_USER_MAGIC_SPECIFIC) {
+		for (idx = 0; idx < lsm1->lsm_md_stripe_count; idx++) {
+			if (lsm1->lsm_md_oinfo[idx].lmo_mds !=
+			    lsm2->lsm_md_oinfo[idx].lmo_mds)
+				return false;
+		}
+	}
+
+	return true;
+}
+
+static inline void lsm_md_dump(int mask, const struct lmv_stripe_md *lsm)
 {
 	int i;
 
-	lmv_dst->lmv_magic = cpu_to_le32(lmv_src->lmv_magic);
-	lmv_dst->lmv_stripe_count = cpu_to_le32(lmv_src->lmv_stripe_count);
-	lmv_dst->lmv_master_mdt_index =
-		cpu_to_le32(lmv_src->lmv_master_mdt_index);
-	lmv_dst->lmv_hash_type = cpu_to_le32(lmv_src->lmv_hash_type);
-	lmv_dst->lmv_layout_version = cpu_to_le32(lmv_src->lmv_layout_version);
-	for (i = 0; i < lmv_src->lmv_stripe_count; i++)
-		fid_cpu_to_le(&lmv_dst->lmv_stripe_fids[i],
-			      &lmv_src->lmv_stripe_fids[i]);
+	CDEBUG_LIMIT(mask,
+		     "dump LMV: magic=%#x count=%u index=%u hash=%s:%#x max_inherit=%hhu max_inherit_rr=%hhu version=%u migrate_offset=%u migrate_hash=%s:%x pool=%.*s\n",
+		     lsm->lsm_md_magic, lsm->lsm_md_stripe_count,
+		     lsm->lsm_md_master_mdt_index,
+		     lmv_is_known_hash_type(lsm->lsm_md_hash_type) ?
+			mdt_hash_name[lsm->lsm_md_hash_type & LMV_HASH_TYPE_MASK] :
+			"invalid", lsm->lsm_md_hash_type,
+		     lsm->lsm_md_max_inherit, lsm->lsm_md_max_inherit_rr,
+		     lsm->lsm_md_layout_version, lsm->lsm_md_migrate_offset,
+		     lmv_is_known_hash_type(lsm->lsm_md_migrate_hash) ?
+			mdt_hash_name[lsm->lsm_md_migrate_hash & LMV_HASH_TYPE_MASK] :
+			"invalid", lsm->lsm_md_migrate_hash,
+		     LOV_MAXPOOLNAME, lsm->lsm_md_pool_name);
+
+	if (!lmv_dir_striped(lsm))
+		return;
+
+	for (i = 0; i < lsm->lsm_md_stripe_count; i++)
+		CDEBUG(mask, "stripe[%d] "DFID"\n",
+		       i, PFID(&lsm->lsm_md_oinfo[i].lmo_fid));
 }
 
+union lmv_mds_md;
+
+void lmv_free_memmd(struct lmv_stripe_md *lsm);
+
 static inline void lmv1_le_to_cpu(struct lmv_mds_md_v1 *lmv_dst,
 				  const struct lmv_mds_md_v1 *lmv_src)
 {
-	int i;
+	__u32 i;
 
 	lmv_dst->lmv_magic = le32_to_cpu(lmv_src->lmv_magic);
 	lmv_dst->lmv_stripe_count = le32_to_cpu(lmv_src->lmv_stripe_count);
@@ -88,32 +163,364 @@ static inline void lmv1_le_to_cpu(struct lmv_mds_md_v1 *lmv_dst,
 		le32_to_cpu(lmv_src->lmv_master_mdt_index);
 	lmv_dst->lmv_hash_type = le32_to_cpu(lmv_src->lmv_hash_type);
 	lmv_dst->lmv_layout_version = le32_to_cpu(lmv_src->lmv_layout_version);
+	if (lmv_src->lmv_stripe_count > LMV_MAX_STRIPE_COUNT)
+		return;
 	for (i = 0; i < lmv_src->lmv_stripe_count; i++)
 		fid_le_to_cpu(&lmv_dst->lmv_stripe_fids[i],
 			      &lmv_src->lmv_stripe_fids[i]);
 }
 
-static inline void lmv_cpu_to_le(union lmv_mds_md *lmv_dst,
+static inline void lmv_le_to_cpu(union lmv_mds_md *lmv_dst,
 				 const union lmv_mds_md *lmv_src)
 {
-	switch (lmv_src->lmv_magic) {
+	switch (le32_to_cpu(lmv_src->lmv_magic)) {
 	case LMV_MAGIC_V1:
-		lmv1_cpu_to_le(&lmv_dst->lmv_md_v1, &lmv_src->lmv_md_v1);
+		lmv1_le_to_cpu(&lmv_dst->lmv_md_v1, &lmv_src->lmv_md_v1);
 		break;
 	default:
 		break;
 	}
 }
 
-static inline void lmv_le_to_cpu(union lmv_mds_md *lmv_dst,
-				 const union lmv_mds_md *lmv_src)
+/* This hash is only for testing purposes */
+static inline unsigned int
+lmv_hash_all_chars(unsigned int count, const char *name, int namelen)
 {
-	switch (le32_to_cpu(lmv_src->lmv_magic)) {
-	case LMV_MAGIC_V1:
-		lmv1_le_to_cpu(&lmv_dst->lmv_md_v1, &lmv_src->lmv_md_v1);
-		break;
-	default:
-		break;
+	unsigned int c = 0;
+	const unsigned char *p = (const unsigned char *)name;
+
+	while (--namelen >= 0)
+		c += p[namelen];
+
+	c = c % count;
+
+	return c;
+}
+
+static inline unsigned int
+lmv_hash_fnv1a(unsigned int count, const char *name, int namelen)
+{
+	__u64 hash;
+
+	hash = lustre_hash_fnv_1a_64(name, namelen);
+
+	return do_div(hash, count);
+}
+
+/*
+ * Robert Jenkins' function for mixing 32-bit values
+ * http://burtleburtle.net/bob/hash/evahash.html
+ * a, b = random bits, c = input and output
+ *
+ * Mix the inputs to generate an evenly distributed hash.
+ */
+#define crush_hashmix(a, b, c)				\
+do {							\
+	a = a - b;  a = a - c;  a = a ^ (c >> 13);	\
+	b = b - c;  b = b - a;  b = b ^ (a << 8);	\
+	c = c - a;  c = c - b;  c = c ^ (b >> 13);	\
+	a = a - b;  a = a - c;  a = a ^ (c >> 12);	\
+	b = b - c;  b = b - a;  b = b ^ (a << 16);	\
+	c = c - a;  c = c - b;  c = c ^ (b >> 5);	\
+	a = a - b;  a = a - c;  a = a ^ (c >> 3);	\
+	b = b - c;  b = b - a;  b = b ^ (a << 10);	\
+	c = c - a;  c = c - b;  c = c ^ (b >> 15);	\
+} while (0)
+
+#define crush_hash_seed 1315423911
+
+static inline __u32 crush_hash(__u32 a, __u32 b)
+{
+	__u32 hash = crush_hash_seed ^ a ^ b;
+	__u32 x = 231232;
+	__u32 y = 1232;
+
+	crush_hashmix(a, b, hash);
+	crush_hashmix(x, a, hash);
+	crush_hashmix(b, y, hash);
+
+	return hash;
+}
+
+/* refer to https://github.com/ceph/ceph/blob/master/src/crush/hash.c and
+ * https://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf for details of the CRUSH
+ * algorithm.
+ */
+static inline unsigned int
+lmv_hash_crush(unsigned int count, const char *name, int namelen, bool crush2)
+{
+	unsigned long long straw;
+	unsigned long long highest_straw = 0;
+	unsigned int pg_id;
+	unsigned int idx = 0;
+	int i;
+
+	/* put temp and backup files on the same MDT where the target is
+	 * located. Temporary file naming rules:
+	 * 1. rsync: .<target>.XXXXXX
+	 * 2. dstripe: <target>.XXXXXXXX
+	 */
+	if (lu_name_is_temp_file(name, namelen, true, 6, crush2)) {
+		name++;
+		namelen -= 8;
+	} else if (lu_name_is_temp_file(name, namelen, false, 8, crush2)) {
+		namelen -= 9;
+	} else if (lu_name_is_backup_file(name, namelen, &i)) {
+		LASSERT(i < namelen);
+		namelen -= i;
 	}
+
+	pg_id = lmv_hash_fnv1a(LMV_CRUSH_PG_COUNT, name, namelen);
+
+	/* distribute PGs among all stripes pseudo-randomly, so they are
+	 * almost evenly distributed, and when the stripe count changes only
+	 * (delta / total) sub-files need to be moved, where 'delta' is the
+	 * added or removed stripe count and 'total' is the total stripe
+	 * count before the change for removal, or after the change for
+	 * addition.
+	 */
+	for (i = 0; i < count; i++) {
+		straw = crush_hash(pg_id, i);
+		if (straw > highest_straw) {
+			highest_straw = straw;
+			idx = i;
+		}
+	}
+	LASSERT(idx < count);
+
+	return idx;
+}
+
+/* a directory layout may change in three ways:
+ * 1. directory migration: in its LMV, source stripes are appended after
+ *    target stripes; \a migrate_hash is the source hash type and
+ *    \a migrate_offset is the target stripe count.
+ * 2. directory split: \a migrate_hash is the hash type before the split,
+ *    \a migrate_offset is the stripe count before the split.
+ * 3. directory merge: \a migrate_hash is the hash type after the merge,
+ *    \a migrate_offset is the stripe count after the merge.
+ */
+static inline int
+__lmv_name_to_stripe_index(__u32 hash_type, __u32 stripe_count,
+			   __u32 migrate_hash, __u32 migrate_offset,
+			   const char *name, int namelen, bool new_layout)
+{
+	__u32 saved_hash = hash_type;
+	__u32 saved_count = stripe_count;
+	int stripe_index = 0;
+
+	LASSERT(namelen > 0);
+	LASSERT(stripe_count > 0);
+
+	if (lmv_hash_is_splitting(hash_type)) {
+		if (!new_layout) {
+			hash_type = migrate_hash;
+			stripe_count = migrate_offset;
+		}
+	} else if (lmv_hash_is_merging(hash_type)) {
+		if (new_layout) {
+			hash_type = migrate_hash;
+			stripe_count = migrate_offset;
+		}
+	} else if (lmv_hash_is_migrating(hash_type)) {
+		if (new_layout) {
+			stripe_count = migrate_offset;
+		} else {
+			hash_type = migrate_hash;
+			stripe_count -= migrate_offset;
+		}
+	}
+
+	if (stripe_count > 1) {
+		switch (hash_type & LMV_HASH_TYPE_MASK) {
+		case LMV_HASH_TYPE_ALL_CHARS:
+			stripe_index = lmv_hash_all_chars(stripe_count, name,
+							  namelen);
+			break;
+		case LMV_HASH_TYPE_FNV_1A_64:
+			stripe_index = lmv_hash_fnv1a(stripe_count, name,
+						      namelen);
+			break;
+		case LMV_HASH_TYPE_CRUSH:
+			stripe_index = lmv_hash_crush(stripe_count, name,
+						      namelen, false);
+			break;
+		case LMV_HASH_TYPE_CRUSH2:
+			stripe_index = lmv_hash_crush(stripe_count, name,
+						      namelen, true);
+			break;
+		default:
+			return -EBADFD;
+		}
+	}
+
+	LASSERT(stripe_index < stripe_count);
+
+	if (!new_layout && lmv_hash_is_migrating(saved_hash))
+		stripe_index += migrate_offset;
+
+	LASSERT(stripe_index < saved_count);
+
+	CDEBUG(D_INFO, "name %.*s hash=%#x/%#x idx=%d/%u/%u under %s layout\n",
+	       namelen, name, saved_hash, migrate_hash, stripe_index,
+	       saved_count, migrate_offset, new_layout ? "new" : "old");
+
+	return stripe_index;
+}
+
+static inline int lmv_name_to_stripe_index(struct lmv_mds_md_v1 *lmv,
+					   const char *name, int namelen)
+{
+	if (lmv->lmv_magic == LMV_MAGIC_V1)
+		return __lmv_name_to_stripe_index(lmv->lmv_hash_type,
+						  lmv->lmv_stripe_count,
+						  lmv->lmv_migrate_hash,
+						  lmv->lmv_migrate_offset,
+						  name, namelen, true);
+
+	if (lmv->lmv_magic == cpu_to_le32(LMV_MAGIC_V1))
+		return __lmv_name_to_stripe_index(
+					le32_to_cpu(lmv->lmv_hash_type),
+					le32_to_cpu(lmv->lmv_stripe_count),
+					le32_to_cpu(lmv->lmv_migrate_hash),
+					le32_to_cpu(lmv->lmv_migrate_offset),
+					name, namelen, true);
+
+	return -EINVAL;
+}
+
+static inline int lmv_name_to_stripe_index_old(struct lmv_mds_md_v1 *lmv,
+					       const char *name, int namelen)
+{
+	if (lmv->lmv_magic == LMV_MAGIC_V1 ||
+	    lmv->lmv_magic == LMV_MAGIC_STRIPE)
+		return __lmv_name_to_stripe_index(lmv->lmv_hash_type,
+						  lmv->lmv_stripe_count,
+						  lmv->lmv_migrate_hash,
+						  lmv->lmv_migrate_offset,
+						  name, namelen, false);
+
+	if (lmv->lmv_magic == cpu_to_le32(LMV_MAGIC_V1) ||
+	    lmv->lmv_magic == cpu_to_le32(LMV_MAGIC_STRIPE))
+		return __lmv_name_to_stripe_index(
+					le32_to_cpu(lmv->lmv_hash_type),
+					le32_to_cpu(lmv->lmv_stripe_count),
+					le32_to_cpu(lmv->lmv_migrate_hash),
+					le32_to_cpu(lmv->lmv_migrate_offset),
+					name, namelen, false);
+
+	return -EINVAL;
+}
+
+static inline bool lmv_user_magic_supported(__u32 lum_magic)
+{
+	return lum_magic == LMV_USER_MAGIC ||
+	       lum_magic == LMV_USER_MAGIC_SPECIFIC ||
+	       lum_magic == LMV_MAGIC_FOREIGN;
+}
+
+#define LMV_DEBUG(mask, lmv, msg)					\
+	CDEBUG_LIMIT(mask,						\
+		     "%s LMV: magic=%#x count=%u index=%u hash=%s:%#x version=%u migrate_offset=%u migrate_hash=%s:%x pool=%.*s\n",\
+		     msg, (lmv)->lmv_magic, (lmv)->lmv_stripe_count,	\
+		     (lmv)->lmv_master_mdt_index,			\
+		     lmv_is_known_hash_type((lmv)->lmv_hash_type) ?	\
+			mdt_hash_name[(lmv)->lmv_hash_type & LMV_HASH_TYPE_MASK] :\
+			"invalid", (lmv)->lmv_hash_type,		\
+		     (lmv)->lmv_layout_version, (lmv)->lmv_migrate_offset,\
+		     lmv_is_known_hash_type((lmv)->lmv_migrate_hash) ?	\
+			mdt_hash_name[(lmv)->lmv_migrate_hash & LMV_HASH_TYPE_MASK] :\
+			"invalid", (lmv)->lmv_migrate_hash,		\
+		     LOV_MAXPOOLNAME, (lmv)->lmv_pool_name)
+
+/* master LMV is sane */
+static inline bool lmv_is_sane(const struct lmv_mds_md_v1 *lmv)
+{
+	if (!lmv)
+		return false;
+
+	if (le32_to_cpu(lmv->lmv_magic) != LMV_MAGIC_V1)
+		goto insane;
+
+	if (le32_to_cpu(lmv->lmv_stripe_count) == 0)
+		goto insane;
+
+	if (!lmv_is_known_hash_type(le32_to_cpu(lmv->lmv_hash_type)))
+		goto insane;
+
+	return true;
+insane:
+	LMV_DEBUG(D_ERROR, lmv, "unknown layout");
+	return false;
+}
+
+/* LMV can be either master or stripe LMV */
+static inline bool lmv_is_sane2(const struct lmv_mds_md_v1 *lmv)
+{
+	if (!lmv)
+		return false;
+
+	if (le32_to_cpu(lmv->lmv_magic) != LMV_MAGIC_V1 &&
+	    le32_to_cpu(lmv->lmv_magic) != LMV_MAGIC_STRIPE)
+		goto insane;
+
+	if (le32_to_cpu(lmv->lmv_stripe_count) == 0)
+		goto insane;
+
+	if (!lmv_is_known_hash_type(le32_to_cpu(lmv->lmv_hash_type)))
+		goto insane;
+
+	return true;
+insane:
+	LMV_DEBUG(D_ERROR, lmv, "unknown layout");
+	return false;
+}
+
+static inline bool lmv_is_splitting(const struct lmv_mds_md_v1 *lmv)
+{
+	if (!lmv_is_sane2(lmv))
+		return false;
+
+	return lmv_hash_is_splitting(le32_to_cpu(lmv->lmv_hash_type));
+}
+
+static inline bool lmv_is_merging(const struct lmv_mds_md_v1 *lmv)
+{
+	if (!lmv_is_sane2(lmv))
+		return false;
+
+	return lmv_hash_is_merging(le32_to_cpu(lmv->lmv_hash_type));
+}
+
+static inline bool lmv_is_migrating(const struct lmv_mds_md_v1 *lmv)
+{
+	if (!lmv_is_sane(lmv))
+		return false;
+
+	return lmv_hash_is_migrating(le32_to_cpu(lmv->lmv_hash_type));
+}
+
+static inline bool lmv_is_restriping(const struct lmv_mds_md_v1 *lmv)
+{
+	if (!lmv_is_sane2(lmv))
+		return false;
+
+	return lmv_hash_is_splitting(le32_to_cpu(lmv->lmv_hash_type)) ||
+	       lmv_hash_is_merging(le32_to_cpu(lmv->lmv_hash_type));
+}
+
+static inline bool lmv_is_layout_changing(const struct lmv_mds_md_v1 *lmv)
+{
+	if (!lmv_is_sane2(lmv))
+		return false;
+
+	return lmv_hash_is_splitting(le32_to_cpu(lmv->lmv_hash_type)) ||
+	       lmv_hash_is_merging(le32_to_cpu(lmv->lmv_hash_type)) ||
+	       lmv_hash_is_migrating(le32_to_cpu(lmv->lmv_hash_type));
+}
+
+static inline bool lmv_is_fixed(const struct lmv_mds_md_v1 *lmv)
+{
+	return le32_to_cpu(lmv->lmv_hash_type) & LMV_HASH_FLAG_FIXED;
+}
+
 #endif
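
The straw-draw selection added in lmv_hash_crush() above can be exercised outside the tree. The following standalone sketch reimplements the placement path in userspace: the Jenkins mix and crush_hash() are copied from the patch, while the FNV-1a constants and the placement-group count are assumptions (LMV_CRUSH_PG_COUNT is not defined in this header; 4096 is a stand-in, and lustre_hash_fnv_1a_64() is assumed to be standard 64-bit FNV-1a). It demonstrates the property the patch comment claims: growing the stripe count relocates only a fraction of names, and a name moves only when the newly added stripe wins the draw.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define PG_COUNT 4096	/* assumed stand-in for LMV_CRUSH_PG_COUNT */

/* Jenkins 32-bit mix, copied from the header above */
#define crush_hashmix(a, b, c)				\
do {							\
	a = a - b;  a = a - c;  a = a ^ (c >> 13);	\
	b = b - c;  b = b - a;  b = b ^ (a << 8);	\
	c = c - a;  c = c - b;  c = c ^ (b >> 13);	\
	a = a - b;  a = a - c;  a = a ^ (c >> 12);	\
	b = b - c;  b = b - a;  b = b ^ (a << 16);	\
	c = c - a;  c = c - b;  c = c ^ (b >> 5);	\
	a = a - b;  a = a - c;  a = a ^ (c >> 3);	\
	b = b - c;  b = b - a;  b = b ^ (a << 10);	\
	c = c - a;  c = c - b;  c = c ^ (b >> 15);	\
} while (0)

static uint32_t crush_hash(uint32_t a, uint32_t b)
{
	uint32_t hash = 1315423911U ^ a ^ b;	/* crush_hash_seed */
	uint32_t x = 231232;
	uint32_t y = 1232;

	crush_hashmix(a, b, hash);
	crush_hashmix(x, a, hash);
	crush_hashmix(b, y, hash);

	return hash;
}

/* standard 64-bit FNV-1a; lustre_hash_fnv_1a_64() is assumed equivalent */
static uint64_t fnv_1a_64(const char *name, size_t namelen)
{
	uint64_t hash = 0xcbf29ce484222325ULL;

	while (namelen-- > 0) {
		hash ^= (unsigned char)*name++;
		hash *= 0x100000001b3ULL;
	}
	return hash;
}

/* mirror of the straw draw in lmv_hash_crush(), minus temp-file handling */
static unsigned int crush_index(unsigned int count, const char *name)
{
	unsigned int pg_id = fnv_1a_64(name, strlen(name)) % PG_COUNT;
	unsigned long long straw, highest_straw = 0;
	unsigned int i, idx = 0;

	for (i = 0; i < count; i++) {
		straw = crush_hash(pg_id, i);
		if (straw > highest_straw) {
			highest_straw = straw;
			idx = i;
		}
	}
	return idx;
}

int main(void)
{
	const char *names[] = { "alpha", "beta", "gamma", "delta", "epsilon" };
	unsigned int i, moved = 0;

	for (i = 0; i < 5; i++) {
		unsigned int old_idx = crush_index(4, names[i]);
		unsigned int new_idx = crush_index(5, names[i]);

		printf("%-8s 4 stripes -> %u, 5 stripes -> %u\n",
		       names[i], old_idx, new_idx);
		/* a name moves only if the new stripe wins the draw */
		moved += old_idx != new_idx;
	}
	printf("%u of 5 names relocated\n", moved);
	return 0;
}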
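The three layout-changing branches of __lmv_name_to_stripe_index() are easiest to see with concrete numbers. The standalone sketch below mirrors only the migration branch, using the ALL_CHARS hash for both the source and target layouts for simplicity (in the real code the source side is hashed with migrate_hash, which may differ); the name and stripe counts are illustrative, not taken from the patch.

#include <stdio.h>

/* same arithmetic as lmv_hash_all_chars() in the header */
static unsigned int hash_all_chars(unsigned int count, const char *name,
				   int namelen)
{
	unsigned int c = 0;

	while (--namelen >= 0)
		c += (unsigned char)name[namelen];

	return c % count;
}

/* mirrors the lmv_hash_is_migrating() case of __lmv_name_to_stripe_index():
 * during migration the combined LMV holds migrate_offset target stripes
 * followed by the old source stripes */
static int name_to_stripe(unsigned int stripe_count,
			  unsigned int migrate_offset,
			  const char *name, int namelen, int new_layout)
{
	if (new_layout)
		/* new entries hash across the target stripes only */
		return hash_all_chars(migrate_offset, name, namelen);

	/* old entries hash across the source stripes and then shift past
	 * the target stripes, as the header does with
	 * "stripe_index += migrate_offset" */
	return migrate_offset +
	       hash_all_chars(stripe_count - migrate_offset, name, namelen);
}

int main(void)
{
	/* a 3-stripe directory migrating to 2 stripes: the combined
	 * layout has 5 stripes and migrate_offset == 2 */
	const char *name = "README";

	printf("new layout -> stripe %d\n", name_to_stripe(5, 2, name, 6, 1));
	printf("old layout -> stripe %d\n", name_to_stripe(5, 2, name, 6, 0));
	return 0;
}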
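On the lookup path, a caller typically resolves a name against the current layout and, while the layout is changing, also against the old one. Below is a hedged sketch of such a caller, not code from this patch: lmv_locate_oinfo() is a hypothetical name, and it assumes an in-tree build where both the CPU-order lmv_stripe_md and the CPU-order lmv_mds_md_v1 for the same directory are at hand.

/* hypothetical caller sketch; only the helpers it calls come from above */
static struct lmv_oinfo *
lmv_locate_oinfo(struct lmv_stripe_md *lsm, struct lmv_mds_md_v1 *lmv,
		 const char *name, int namelen)
{
	int idx;

	/* plain directories have a single target */
	if (!lmv_dir_striped(lsm))
		return &lsm->lsm_md_oinfo[0];

	/* hash the name under the current (new) layout */
	idx = lmv_name_to_stripe_index(lmv, name, namelen);
	if (idx < 0)
		return ERR_PTR(idx);

	/* during migration/split/merge an existing entry may still live
	 * at its old location, so a miss should be retried with the old
	 * layout via lmv_name_to_stripe_index_old() */
	if (lmv_dir_layout_changing(lsm))
		CDEBUG(D_INFO, "%.*s: layout changing, old stripe %d\n",
		       namelen, name,
		       lmv_name_to_stripe_index_old(lmv, name, namelen));

	return &lsm->lsm_md_oinfo[idx];
}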
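An LMV xattr arriving from disk or the wire is little-endian, and the lmv_is_sane*() helpers above deliberately take that form, doing their own le32_to_cpu() conversions. A minimal sketch of the resulting validate-then-convert pattern follows, assuming kernel-side code with this header included; lmv_fetch() and its error choice are hypothetical.

/* hypothetical helper sketch: validate LE buffer, then convert for CPU use */
static int lmv_fetch(union lmv_mds_md *out, const union lmv_mds_md *disk)
{
	/* check magic, stripe count and hash type while still little-endian */
	if (!lmv_is_sane(&disk->lmv_md_v1))
		return -EINVAL;

	/* byte-swap the V1 header fields and stripe FIDs */
	lmv_le_to_cpu(out, disk);

	/* layout-changing directories need both hash functions on lookup */
	if (lmv_is_layout_changing(&disk->lmv_md_v1))
		CDEBUG(D_INFO, "LMV is splitting/merging/migrating\n");

	return 0;
}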