Whamcloud - gitweb
LU-15720 dne: add crush2 hash type
author Andreas Dilger <adilger@whamcloud.com>
Tue, 12 Apr 2022 23:18:10 +0000 (17:18 -0600)
committer Andreas Dilger <adilger@whamcloud.com>
Thu, 6 Jun 2024 08:16:39 +0000 (08:16 +0000)
The original "crush" hash type has a significant error with files
that have all-number suffixes, or suffixes that have non-alpha
characters in them.  These files will all be placed on the same
MDT as the base filename, which causes MDT imbalance.

Add a "crush2" hash type that has more stringent checks for the
suffix, so that it doesn't consider all-digit suffixes, or files
that only have a '.' at the right offset, as temporary files.

Test that the "broken" all-digit or extra-'.' filenames are hashed
properly with "crush2".  We also need to confirm that the old "crush"
hash has not changed (for name lookup compatibility) and still has
the original "bad hashing" bug that puts all files on the same MDT.

Fix handling of types beyond LMV_HASH_TYPE_CRUSH when creating dirs.

Fix debug layout printing of hash_type in more parts of the code.
Don't flood console if hash type is unrecognized in the future.

Lustre-change: https://review.whamcloud.com/47015
Lustre-commit: 1ac4b9598ad6e2f94c4c672b4733186364255c6a

Lustre-change: https://review.whamcloud.com/48713
Lustre-commit: e17471792388e59f44040d48dd8138ec865663af

Fixes: 0a1cf8da8069 ("LU-11025 dne: introduce new directory hash type 'crush'")
Signed-off-by: Andreas Dilger <adilger@whamcloud.com>
Change-Id: I1ce34b8f3af44432f55307ebc6906677c6179d1d
Reviewed-by: Lai Siyao <lai.siyao@whamcloud.com>
Reviewed-by: Yingjin Qian <qian@ddn.com>
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/54925
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
15 files changed:
lustre/include/lu_object.h
lustre/include/lustre_lmv.h
lustre/include/obd_support.h
lustre/include/uapi/linux/lustre/lustre_idl.h
lustre/include/uapi/linux/lustre/lustre_user.h
lustre/llite/dir.c
lustre/lmv/lmv_obd.c
lustre/lod/lod_object.c
lustre/lod/lproc_lod.c
lustre/mdd/mdd_dir.c
lustre/mdt/mdt_reint.c
lustre/mdt/mdt_restripe.c
lustre/tests/sanity.sh
lustre/tests/test-framework.sh
lustre/utils/lfs.c

index 6a696a4..b284474 100644 (file)
@@ -1357,8 +1357,29 @@ static inline bool lu_name_is_dot_or_dotdot(const struct lu_name *lname)
        return name_is_dot_or_dotdot(lname->ln_name, lname->ln_namelen);
 }
 
+/**
+ * Determine if filename should be considered a "temporary" name.
+ *
+ * For temporary names, use only the main part of the filename and ignore
+ * the suffix, so that the filename will hash to the same MDT after it is
+ * renamed.  That avoids creating spurious remote entries for rsync, dcp,
+ * vi, and other tools that create a temporary name before renaming the file.
+ *
+ * The "CRUSH" and "CRUSH2" hash types are slightly different, and should
+ * not be modified without introducing a new hash type.  The hash algorithm
+ * forms an important part of the network protocol for striped directories,
+ * so if the hash function were "fixed" in any way it would prevent clients
+ * from looking up a filename on the right MDT.  LU-15692.
+ *
+ * \param[in] name             filename
+ * \param[in] namelen          length of @name
+ * \param[in] dot_prefix       if @name needs a leading '.' to be temporary
+ * \param[in] suffixlen                number of characters after '.' in @name to check
+ * \param[in] crush2           whether CRUSH or CRUSH2 heuristic should be used
+ */
 static inline bool lu_name_is_temp_file(const char *name, int namelen,
-                                       bool dot_prefix, int suffixlen)
+                                       bool dot_prefix, int suffixlen,
+                                       bool crush2)
 {
        int lower = 0;
        int upper = 0;
@@ -1372,21 +1393,46 @@ static inline bool lu_name_is_temp_file(const char *name, int namelen,
            name[namelen - suffixlen - 1] != '.')
                return false;
 
+       /* Any non-alphanumeric chars in the suffix for CRUSH2 mean the
+        * filename is *not* temporary.  The original CRUSH was incorrectly
+        * matching if a '.' happens to be in the right place, for example
+        * file.mdtest.12.12345 or output.6334.log, which is bad.  LU-15692
+        */
        while (len) {
-               lower += islower(name[namelen - len]);
-               upper += isupper(name[namelen - len]);
-               digit += isdigit(name[namelen - len]);
+               if (islower(name[namelen - len]))
+                       lower++;
+               else if (isupper(name[namelen - len]))
+                       upper++;
+               else if (isdigit(name[namelen - len]))
+                       digit++;
+               else if (crush2)
+                       return false;
                len--;
        }
-       /* mktemp() filename suffixes will have a mix of upper- and lower-case
-        * letters and/or numbers, not all numbers, or all upper or lower-case.
-        * About 0.07% of randomly-generated names will slip through,
+
+       /* mktemp() suffixes normally have a mix of upper- and lower-case
+        * letters and/or digits, rarely all upper- or lower-case or digits.
+        * Random all-digit suffixes are rare (1/45k for suffixlen=6), but
+        * common in normal usage (incrementing versions, dates, ranks, etc),
+        * so are considered non-temporary even if 1 or 2 non-numeric chars.
+        *
+        * About 0.07% of randomly-generated names will slip through, which
+        * only means that they may be renamed to a different MDT (slowdown),
         * but this avoids 99.93% of cross-MDT renames for those files.
         */
-       if ((digit >= suffixlen - 1 && !isdigit(name[namelen - suffixlen])) ||
-           upper == suffixlen || lower == suffixlen)
+       if (upper == suffixlen || lower == suffixlen)
                return false;
 
+       if (crush2) {
+               if (digit >= suffixlen - 1 &&
+                   isdigit(name[namelen - suffixlen]))
+                       return false;
+       } else { /* old crush incorrectly returns "true" for all-digit suffix */
+               if (digit >= suffixlen - 1 &&
+                   !isdigit(name[namelen - suffixlen]))
+                       return false;
+       }
+
        return true;
 }
 
index e6c0c92..cfe7868 100644 (file)
@@ -165,7 +165,7 @@ lmv_stripe_object_dump(int mask, const struct lmv_stripe_object *lsmo)
 
        CDEBUG_LIMIT(mask,
               "dump LMV: refs %u magic=%#x count=%u index=%u hash=%s:%#x max_inherit=%hhu max_inherit_rr=%hhu version=%u migrate_offset=%u migrate_hash=%s:%x pool=%.*s\n",
-              lsm->lsm_md_magic, atomic_read(&lsmo->lso_refs),
+              atomic_read(&lsmo->lso_refs), lsm->lsm_md_magic,
               lsm->lsm_md_stripe_count, lsm->lsm_md_master_mdt_index,
               lmv_is_known_hash_type(lsm->lsm_md_hash_type) ?
                mdt_hash_name[lsm->lsm_md_hash_type & LMV_HASH_TYPE_MASK] :
@@ -310,7 +310,7 @@ static inline __u32 crush_hash(__u32 a, __u32 b)
  * algorithm.
  */
 static inline unsigned int
-lmv_hash_crush(unsigned int count, const char *name, int namelen)
+lmv_hash_crush(unsigned int count, const char *name, int namelen, bool crush2)
 {
        unsigned long long straw;
        unsigned long long highest_straw = 0;
@@ -323,10 +323,10 @@ lmv_hash_crush(unsigned int count, const char *name, int namelen)
         * 1. rsync: .<target>.XXXXXX
         * 2. dstripe: <target>.XXXXXXXX
         */
-       if (lu_name_is_temp_file(name, namelen, true, 6)) {
+       if (lu_name_is_temp_file(name, namelen, true, 6, crush2)) {
                name++;
                namelen -= 8;
-       } else if (lu_name_is_temp_file(name, namelen, false, 8)) {
+       } else if (lu_name_is_temp_file(name, namelen, false, 8, crush2)) {
                namelen -= 9;
        } else if (lu_name_is_backup_file(name, namelen, &i)) {
                LASSERT(i < namelen);
@@ -405,7 +405,11 @@ __lmv_name_to_stripe_index(__u32 hash_type, __u32 stripe_count,
                        break;
                case LMV_HASH_TYPE_CRUSH:
                        stripe_index = lmv_hash_crush(stripe_count, name,
-                                                     namelen);
+                                                     namelen, false);
+                       break;
+               case LMV_HASH_TYPE_CRUSH2:
+                       stripe_index = lmv_hash_crush(stripe_count, name,
+                                                     namelen, true);
                        break;
                default:
                        return -EBADFD;
@@ -479,6 +483,20 @@ static inline bool lmv_user_magic_supported(__u32 lum_magic)
               lum_magic == LMV_MAGIC_FOREIGN;
 }
 
+/*
+ * Print the fields of @lmv through CDEBUG_LIMIT() at debug @mask, prefixed
+ * by @msg.  The hash type and migrate hash are shown by name when
+ * lmv_is_known_hash_type() accepts them, otherwise as "invalid", each
+ * followed by the raw value.  Fields are printed as stored, with no
+ * byte-swapping applied here.
+ *
+ * @lmv is expanded several times, so every use is parenthesized and the
+ * argument should be free of side effects.
+ */
+#define LMV_DEBUG(mask, lmv, msg)                                            \
+       CDEBUG_LIMIT(mask,                                                    \
+              "%s LMV: magic=%#x count=%u index=%u hash=%s:%#x version=%u migrate_offset=%u migrate_hash=%s:%x pool=%.*s\n",\
+              msg, (lmv)->lmv_magic, (lmv)->lmv_stripe_count,                \
+              (lmv)->lmv_master_mdt_index,                                   \
+              lmv_is_known_hash_type((lmv)->lmv_hash_type) ?                 \
+               mdt_hash_name[(lmv)->lmv_hash_type & LMV_HASH_TYPE_MASK] :    \
+               "invalid", (lmv)->lmv_hash_type,                              \
+              (lmv)->lmv_layout_version, (lmv)->lmv_migrate_offset,          \
+              lmv_is_known_hash_type((lmv)->lmv_migrate_hash) ?              \
+               mdt_hash_name[(lmv)->lmv_migrate_hash & LMV_HASH_TYPE_MASK] : \
+               "invalid", (lmv)->lmv_migrate_hash,                           \
+              LOV_MAXPOOLNAME, (lmv)->lmv_pool_name)
+
 /* master LMV is sane */
 static inline bool lmv_is_sane(const struct lmv_mds_md_v1 *lmv)
 {
@@ -496,7 +514,7 @@ static inline bool lmv_is_sane(const struct lmv_mds_md_v1 *lmv)
 
        return true;
 insane:
-       LMV_DEBUG(D_ERROR, lmv, "insane");
+       LMV_DEBUG(D_ERROR, lmv, "unknown layout");
        return false;
 }
 
@@ -518,7 +536,7 @@ static inline bool lmv_is_sane2(const struct lmv_mds_md_v1 *lmv)
 
        return true;
 insane:
-       LMV_DEBUG(D_ERROR, lmv, "insane");
+       LMV_DEBUG(D_ERROR, lmv, "unknown layout");
        return false;
 }
 
index 1ad72a7..12227bb 100644 (file)
@@ -715,7 +715,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_MIGRATE_BAD_HASH              0x1802
 
 /* LMV */
-#define OBD_FAIL_UNKNOWN_LMV_STRIPE            0x1901
+#define OBD_FAIL_LMV_UNKNOWN_STRIPE            0x1901
 
 /* FLR */
 #define OBD_FAIL_FLR_LV_DELAY                  0x1A01
index 0b92d21..2c742d7 100644 (file)
@@ -2313,14 +2313,6 @@ struct lmv_mds_md_v1 {
        struct lu_fid lmv_stripe_fids[0];       /* FIDs for each stripe */
 };
 
-#define LMV_DEBUG(mask, lmv, msg)                                      \
-       CDEBUG(mask,                                                    \
-              "%s LMV: magic=%#x count=%u index=%u hash=%#x version=%u migrate offset=%u migrate hash=%u.\n",  \
-              msg, (lmv)->lmv_magic, (lmv)->lmv_stripe_count,          \
-              (lmv)->lmv_master_mdt_index, (lmv)->lmv_hash_type,       \
-              (lmv)->lmv_layout_version, (lmv)->lmv_migrate_offset,    \
-              (lmv)->lmv_migrate_hash)
-
 /* stripe count before directory split */
 #define lmv_split_offset       lmv_migrate_offset
 /* stripe count after directory merge */
index 17e1906..c033fe7 100644 (file)
@@ -1074,10 +1074,12 @@ struct lmv_user_mds_data {
 
 enum lmv_hash_type {
        LMV_HASH_TYPE_UNKNOWN   = 0,    /* 0 is reserved for testing purpose */
-       LMV_HASH_TYPE_ALL_CHARS = 1,
-       LMV_HASH_TYPE_FNV_1A_64 = 2,
-       LMV_HASH_TYPE_CRUSH     = 3,
+       LMV_HASH_TYPE_ALL_CHARS = 1,    /* simple sum of characters */
+       LMV_HASH_TYPE_FNV_1A_64 = 2,    /* reasonable non-cryptographic hash */
+       LMV_HASH_TYPE_CRUSH     = 3,    /* double-hash to optimize migration */
+       LMV_HASH_TYPE_CRUSH2    = 4,    /* CRUSH with small fixes, LU-15692 */
        LMV_HASH_TYPE_MAX,
+       LMV_HASH_TYPE_DEFAULT   = LMV_HASH_TYPE_FNV_1A_64
 };
 
 static __attribute__((unused)) const char *mdt_hash_name[] = {
@@ -1085,9 +1087,9 @@ static __attribute__((unused)) const char *mdt_hash_name[] = {
        "all_char",
        "fnv_1a_64",
        "crush",
+       "crush2",
 };
 
-#define LMV_HASH_TYPE_DEFAULT LMV_HASH_TYPE_FNV_1A_64
 
 /* Right now only the lower part(0-16bits) of lmv_hash_type is being used,
  * and the higher part will be the flag to indicate the status of object,
@@ -1097,9 +1099,8 @@ static __attribute__((unused)) const char *mdt_hash_name[] = {
 
 static inline bool lmv_is_known_hash_type(__u32 type)
 {
-       return (type & LMV_HASH_TYPE_MASK) == LMV_HASH_TYPE_FNV_1A_64 ||
-              (type & LMV_HASH_TYPE_MASK) == LMV_HASH_TYPE_ALL_CHARS ||
-              (type & LMV_HASH_TYPE_MASK) == LMV_HASH_TYPE_CRUSH;
+       return (type & LMV_HASH_TYPE_MASK) > LMV_HASH_TYPE_UNKNOWN &&
+              (type & LMV_HASH_TYPE_MASK) < LMV_HASH_TYPE_MAX;
 }
 
 /* This flag indicates that overstriping (>1 stripe per MDT) is desired */
index e58712f..a347bef 100644 (file)
@@ -467,9 +467,10 @@ static int ll_dir_setdirstripe(struct dentry *dparent, struct lmv_user_md *lump,
 
        if (lump->lum_magic != LMV_MAGIC_FOREIGN) {
                CDEBUG(D_VFSTRACE,
-                      "VFS Op:inode="DFID"(%p) name %s stripe_offset %d, stripe_count: %u\n",
+                      "VFS Op:inode="DFID"(%p) name=%s stripe_offset=%d stripe_count=%u, hash_type=%x\n",
                       PFID(ll_inode2fid(parent)), parent, dirname,
-                      (int)lump->lum_stripe_offset, lump->lum_stripe_count);
+                      (int)lump->lum_stripe_offset, lump->lum_stripe_count,
+                      lump->lum_hash_type);
        } else {
                struct lmv_foreign_md *lfm = (struct lmv_foreign_md *)lump;
 
@@ -490,7 +491,9 @@ static int ll_dir_setdirstripe(struct dentry *dparent, struct lmv_user_md *lump,
        /* MDS < 2.14 doesn't support 'crush' hash type, and cannot handle
         * unknown hash if client doesn't set a valid one. switch to fnv_1a_64.
         */
-       if (!(exp_connect_flags2(sbi->ll_md_exp) & OBD_CONNECT2_CRUSH)) {
+       if (CFS_FAIL_CHECK(OBD_FAIL_LMV_UNKNOWN_STRIPE)) {
+               lump->lum_hash_type = cfs_fail_val;
+       } else if (!(exp_connect_flags2(sbi->ll_md_exp) & OBD_CONNECT2_CRUSH)) {
                enum lmv_hash_type type = lump->lum_hash_type &
                                          LMV_HASH_TYPE_MASK;
 
index 0a4e831..fe07a6a 100644 (file)
@@ -3588,8 +3588,8 @@ static int lmv_unpack_md_v1(struct obd_export *exp, struct lmv_stripe_md *lsm,
        lsm->lsm_md_magic = le32_to_cpu(lmm1->lmv_magic);
        lsm->lsm_md_stripe_count = le32_to_cpu(lmm1->lmv_stripe_count);
        lsm->lsm_md_master_mdt_index = le32_to_cpu(lmm1->lmv_master_mdt_index);
-       if (OBD_FAIL_CHECK(OBD_FAIL_UNKNOWN_LMV_STRIPE))
-               lsm->lsm_md_hash_type = LMV_HASH_TYPE_UNKNOWN;
+       if (CFS_FAIL_CHECK(OBD_FAIL_LMV_UNKNOWN_STRIPE))
+               lsm->lsm_md_hash_type = cfs_fail_val ?: LMV_HASH_TYPE_UNKNOWN;
        else
                lsm->lsm_md_hash_type = le32_to_cpu(lmm1->lmv_hash_type);
        lsm->lsm_md_layout_version = le32_to_cpu(lmm1->lmv_layout_version);
index d340d2d..575e459 100644 (file)
@@ -2454,16 +2454,19 @@ static int lod_declare_xattr_set_lmv(const struct lu_env *env,
                                     struct dt_object_format *dof,
                                     struct thandle *th)
 {
-       struct lod_object       *lo = lod_dt_obj(dt);
-       struct lmv_user_md_v1   *lum = lum_buf->lb_buf;
-       int                     rc;
-       ENTRY;
+       struct lod_object *lo = lod_dt_obj(dt);
+       struct lmv_user_md_v1 *lum = lum_buf->lb_buf;
+       int rc;
 
+       ENTRY;
        LASSERT(lum != NULL);
 
-       CDEBUG(D_INFO, "lum magic = %x count = %u offset = %d\n",
-              le32_to_cpu(lum->lum_magic), le32_to_cpu(lum->lum_stripe_count),
-              (int)le32_to_cpu(lum->lum_stripe_offset));
+       CDEBUG(D_INFO,
+              "lum magic=%x hash=%x count=%u offset=%d inherit=%u rr=%u\n",
+              le32_to_cpu(lum->lum_magic), le32_to_cpu(lum->lum_hash_type),
+              le32_to_cpu(lum->lum_stripe_count),
+              (int)le32_to_cpu(lum->lum_stripe_offset),
+              lum->lum_max_inherit, lum->lum_max_inherit_rr);
 
        if (lo->ldo_dir_stripe_count == 0) {
                if (lo->ldo_is_foreign) {
@@ -2494,7 +2497,7 @@ out:
  *
  * \param[in] env      execution environment
  * \param[in] dt       target object
- * \param[in] buf      LMV buf which contains source stripe fids
+ * \param[in] lmv_buf  LMV buf which contains source stripe FIDs
  * \param[in] fl       set or replace
  * \param[in] th       transaction handle
  *
@@ -2503,14 +2506,14 @@ out:
  */
 static int lod_dir_layout_set(const struct lu_env *env,
                              struct dt_object *dt,
-                             const struct lu_buf *buf,
+                             const struct lu_buf *lmv_buf,
                              int fl,
                              struct thandle *th)
 {
        struct dt_object *next = dt_object_child(dt);
        struct lod_object *lo = lod_dt_obj(dt);
        struct lod_device *lod = lu2lod_dev(lod2lu_obj(lo)->lo_dev);
-       struct lmv_mds_md_v1 *lmv = buf->lb_buf;
+       struct lmv_mds_md_v1 *lmv = lmv_buf->lb_buf;
        struct lmv_mds_md_v1 *slave_lmv;
        struct lu_buf slave_buf;
        int i;
@@ -2530,7 +2533,7 @@ static int lod_dir_layout_set(const struct lu_env *env,
 
        LMV_DEBUG(D_INFO, lmv, "set");
 
-       rc = lod_sub_xattr_set(env, next, buf, XATTR_NAME_LMV, fl, th);
+       rc = lod_sub_xattr_set(env, next, lmv_buf, XATTR_NAME_LMV, fl, th);
        if (rc)
                RETURN(rc);
 
@@ -5364,8 +5367,9 @@ static void lod_striping_from_default(struct lod_object *lo,
                        struct lod_layout_component *def_comp =
                                                &lds->lds_def_comp_entries[i];
 
-                       CDEBUG(D_LAYOUT, "Inherit from default: flags=%#x "
-                              "size=%hu nr=%u offset=%u pattern=%#x pool=%s\n",
+                       CDEBUG(D_LAYOUT,
+                              "inherit "DFID" file layout from default: flags=%#x size=%hu nr=%u offset=%u pattern=%#x pool=%s\n",
+                              PFID(lu_object_fid(&lo->ldo_obj.do_lu)),
                               def_comp->llc_flags,
                               def_comp->llc_stripe_size,
                               def_comp->llc_stripe_count,
@@ -5414,11 +5418,12 @@ static void lod_striping_from_default(struct lod_object *lo,
                if (lo->ldo_dir_stripe_offset == -1)
                        lo->ldo_dir_stripe_offset =
                                lds->lds_dir_def_stripe_offset;
-               if (lo->ldo_dir_hash_type == 0)
+               if (lo->ldo_dir_hash_type == LMV_HASH_TYPE_UNKNOWN)
                        lo->ldo_dir_hash_type = lds->lds_dir_def_hash_type;
 
-               CDEBUG(D_LAYOUT, "striping from default dir: count:%hu, "
-                      "offset:%u, hash_type:%u\n",
+               CDEBUG(D_LAYOUT,
+                      "inherit "DFID" dir layout from default: count=%hu offset=%u hash_type=%x\n",
+                      PFID(lu_object_fid(&lo->ldo_obj.do_lu)),
                       lo->ldo_dir_stripe_count, lo->ldo_dir_stripe_offset,
                       lo->ldo_dir_hash_type);
        }
@@ -5457,8 +5462,8 @@ static inline bool lod_need_inherit_more(struct lod_object *lo, bool from_root,
  * This method is used to make a decision on the striping configuration for the
  * object being created. It can be taken from the \a parent object if it exists,
  * or filesystem's default. The resulting configuration (number of stripes,
- * stripe size/offset, pool name, etc) is stored in the object itself and will
- * be used by the methods like ->doo_declare_create().
+ * stripe size/offset, pool name, hash_type, etc.) is stored in the object
+ * itself and will be used by the methods like ->doo_declare_create().
  *
  * \see dt_object_operations::do_ah_init() in the API description for details.
  */
@@ -5555,7 +5560,7 @@ static void lod_ah_init(const struct lu_env *env,
                        lc->ldo_dir_hash_type =
                                le32_to_cpu(lum1->lum_hash_type);
                        CDEBUG(D_INFO,
-                              "set dirstripe: count %hu, offset %d, hash %u\n",
+                              "set dirstripe: count %hu, offset %d, hash %x\n",
                                lc->ldo_dir_stripe_count,
                                (int)lc->ldo_dir_stripe_offset,
                                lc->ldo_dir_hash_type);
@@ -5637,8 +5642,9 @@ static void lod_ah_init(const struct lu_env *env,
                                lc->ldo_dir_stripe_count = 0;
                }
 
-               if (!(lc->ldo_dir_hash_type & LMV_HASH_TYPE_MASK))
-                       lc->ldo_dir_hash_type |=
+               if (!lmv_is_known_hash_type(lc->ldo_dir_hash_type))
+                       lc->ldo_dir_hash_type =
+                               (lc->ldo_dir_hash_type & LMV_HASH_FLAG_KNOWN) |
                                d->lod_mdt_descs.ltd_lmv_desc.ld_pattern;
 
                /* make sure all fscrypt metadata stays on same mdt */
index 604034d..7767244 100644 (file)
@@ -1102,24 +1102,30 @@ static ssize_t mdt_hash_store(struct kobject *kobj, struct attribute *attr,
        struct lod_device *lod = dt2lod_dev(dt);
        char *hash;
        int len;
+       int rc = -EINVAL;
        int i;
 
        hash = kstrndup(buffer, count, GFP_KERNEL);
        if (!hash)
                return -ENOMEM;
 
+       if (kstrtoint(hash, 10, &i) == 0 && lmv_is_known_hash_type(i)) {
+               lod->lod_mdt_descs.ltd_lmv_desc.ld_pattern = i;
+               GOTO(out, rc = count);
+       }
+
        len = strcspn(hash, "\n ");
        hash[len] = '\0';
-       for (i = LMV_HASH_TYPE_ALL_CHARS; i < LMV_HASH_TYPE_MAX; i++) {
-               if (!strcmp(hash, mdt_hash_name[i])) {
+       for (i = LMV_HASH_TYPE_ALL_CHARS; i < ARRAY_SIZE(mdt_hash_name); i++) {
+               if (strcmp(hash, mdt_hash_name[i]) == 0) {
                        lod->lod_mdt_descs.ltd_lmv_desc.ld_pattern = i;
-                       kfree(hash);
-                       return count;
+                       GOTO(out, rc = count);
                }
        }
+out:
        kfree(hash);
 
-       return -EINVAL;
+       return rc;
 }
 LUSTRE_RW_ATTR(mdt_hash);
 
index 172b453..4b1c550 100644 (file)
@@ -2280,13 +2280,14 @@ static int mdd_create_sanity_check(const struct lu_env *env,
                if (!lmv_user_magic_supported(le32_to_cpu(lum->lum_magic)) &&
                    le32_to_cpu(lum->lum_magic) != LMV_USER_MAGIC_V0) {
                        rc = -EINVAL;
-                       CERROR("%s: invalid lmv_user_md: magic = %x, "
-                              "stripe_offset = %d, stripe_count = %u: "
-                              "rc = %d\n", mdd2obd_dev(m)->obd_name,
-                               le32_to_cpu(lum->lum_magic),
+                       CERROR("%s: invalid lmv_user_md: magic=%x hash=%x stripe_offset=%d stripe_count=%u: rc = %d\n",
+                              mdd2obd_dev(m)->obd_name,
+                              le32_to_cpu(lum->lum_magic),
+                              le32_to_cpu(lum->lum_hash_type),
                               (int)le32_to_cpu(lum->lum_stripe_offset),
                               le32_to_cpu(lum->lum_stripe_count), rc);
-                       return rc;
+
+                       RETURN(rc);
                }
        }
 
index 9d17a16..317f7b4 100644 (file)
@@ -545,7 +545,7 @@ static int mdt_create(struct mdt_thread_info *info)
                }
 
                if ((!(exp_connect_flags2(exp) & OBD_CONNECT2_CRUSH)) &&
-                   (le32_to_cpu(lum->lum_hash_type) & LMV_HASH_TYPE_MASK) ==
+                   (le32_to_cpu(lum->lum_hash_type) & LMV_HASH_TYPE_MASK) >=
                    LMV_HASH_TYPE_CRUSH)
                        RETURN(-EPROTO);
 
index 4030db7..ec123a8 100644 (file)
@@ -607,12 +607,14 @@ static int mdt_restripe_migrate(struct mdt_thread_info *info)
        if ((lmv_is_splitting(lmv) &&
             idx >= le32_to_cpu(lmv->lmv_split_offset)) ||
            (lmv_is_merging(lmv) &&
-            (le32_to_cpu(lmv->lmv_hash_type) & LMV_HASH_TYPE_MASK) ==
-               LMV_HASH_TYPE_CRUSH &&
+            ((le32_to_cpu(lmv->lmv_hash_type) & LMV_HASH_TYPE_MASK) ==
+                                                LMV_HASH_TYPE_CRUSH ||
+             (le32_to_cpu(lmv->lmv_hash_type) & LMV_HASH_TYPE_MASK) ==
+                                                LMV_HASH_TYPE_CRUSH2) &&
             idx < le32_to_cpu(lmv->lmv_merge_offset))) {
                /* new stripes doesn't need to migrate sub files in dir
                 * split, neither for target stripes in dir merge if hash type
-                * is CRUSH.
+                * is CRUSH or CRUSH2.
                 */
                rc = mdt_restripe_migrate_finish(info, stripe, lmv);
                RETURN(rc);
index b54f58d..c9b0e30 100644 (file)
@@ -3012,6 +3012,8 @@ test_27K() {
                $DIR/$tdir/${tdir}2 ||
                error "$DIR/$tdir/${tdir}2: create failed"
 
+       $LFS getdirstripe -v $DIR/$tdir/${tdir}2
+
        $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
                grep "lfm_magic:.*0x0CD50CD0" ||
                error "$DIR/$tdir/${tdir}2: invalid LMV EA magic"
@@ -4210,45 +4212,124 @@ test_33g() {
 }
 run_test 33g "nonroot user create already existing root created file"
 
-test_33h() {
-       [ $MDSCOUNT -lt 2 ] && skip_env "needs >= 2 MDTs"
-       [ $MDS1_VERSION -lt $(version_code 2.13.50) ] &&
-               skip "Need MDS version at least 2.13.50"
+# Check which MDT backup- and temporary-looking filenames land on in a
+# directory striped over $MDSCOUNT MDTs with the given hash type.
+# usage: sub_33h <hash_type>	(callers in this file pass crush or crush2)
+#
+# 1. backup names ($tfile.bak, .SAV, .orig, ~) must match $tfile's MDT
+# 2. mktemp-style names should almost always match $tfile's MDT
+# 3. all-digit suffixes: all on $tfile's MDT for "crush" (the known bug),
+#    about 1/MDSCOUNT of them for any other hash type
+# 4. suffixes containing extra '.' characters: same expectation as 3.
+sub_33h() {
+       local hash_type=$1
+       local count=250
 
-       test_mkdir -c $MDSCOUNT -H crush $DIR/$tdir ||
-               error "mkdir $tdir failed"
+       test_mkdir -c $MDSCOUNT -H $hash_type $DIR/$tdir ||
+               error "lfs mkdir -H $hash_type $tdir failed"
        touch $DIR/$tdir/$tfile || error "touch $tfile failed"
 
        local index=$($LFS getstripe -m $DIR/$tdir/$tfile)
        local index2
+       local fname
 
        for fname in $DIR/$tdir/$tfile.bak \
                     $DIR/$tdir/$tfile.SAV \
                     $DIR/$tdir/$tfile.orig \
                     $DIR/$tdir/$tfile~; do
-               touch $fname  || error "touch $fname failed"
+               touch $fname || error "touch $fname failed"
                index2=$($LFS getstripe -m $fname)
-               [ $index -eq $index2 ] ||
+               (( $index == $index2 )) ||
                        error "$fname MDT index mismatch $index != $index2"
        done
 
        local failed=0
-       for i in {1..250}; do
-               for fname in $(mktemp -u $DIR/$tdir/.$tfile.XXXXXX) \
-                            $(mktemp $DIR/$tdir/$tfile.XXXXXXXX); do
-                       touch $fname  || error "touch $fname failed"
+       local patterns=(".$tfile.XXXXXX" "$tfile.XXXXXXXX")
+       local pattern
+
+       for pattern in ${patterns[*]}; do
+               echo "pattern $pattern"
+               fname=$DIR/$tdir/$pattern
+               for (( i = 0; i < $count; i++ )); do
+                       fname=$(mktemp $DIR/$tdir/$pattern) ||
+                               error "mktemp $DIR/$tdir/$pattern failed"
                        index2=$($LFS getstripe -m $fname)
-                       if [[ $index != $index2 ]]; then
-                               failed=$((failed + 1))
-                               echo "$fname MDT index mismatch $index != $index2"
-                       fi
+                       (( $index == $index2 )) && continue
+
+                       failed=$((failed + 1))
+                       echo "$fname MDT index mismatch $index != $index2"
+               done
+       done
+
+       echo "$failed/$count MDT index mismatches, expect ~2-4"
+       (( failed < 10 )) || error "MDT index mismatch $failed/$count times"
+
+       local same=0
+       local expect
+
+       # verify that "crush" is still broken with all files on same MDT,
+       # crush2 should have about 1/MDSCOUNT files on each MDT, with margin
+       [[ "$hash_type" == "crush" ]] && expect=$count ||
+               expect=$((count / MDSCOUNT))
+
+       # crush2 doesn't put all-numeric suffixes on the same MDT,
+       # filename like $tfile.12345678 should *not* be considered temp
+       for pattern in ${patterns[*]}; do
+               local base=${pattern%%X*}
+               local suff=${pattern#$base}
+
+               echo "pattern $pattern"
+               for (( i = 0; i < $count; i++ )); do
+                       fname=$DIR/$tdir/$base$((${suff//X/1} + i))
+                       touch $fname || error "touch $fname failed"
+                       index2=$($LFS getstripe -m $fname)
+                       (( $index != $index2 )) && continue
+
+                       same=$((same + 1))
+               done
+       done
+
+       # the number of "bad" hashes is random, as it depends on the random
+       # filenames generated by "mktemp".  Allow some margin in the results.
+       echo "$((same/${#patterns[*]}))/$count matches, expect ~$expect for $1"
+       (( same / ${#patterns[*]} <= expect * 9 / 7 &&
+          same / ${#patterns[*]} > expect * 5 / 7 )) ||
+               error "MDT index match $((same / ${#patterns[*]}))/$count times"
+       same=0
+
+       # crush2 doesn't put suffixes with special characters on the same MDT
+       # filename like $tfile.txt.1234 should *not* be considered temp
+       for pattern in ${patterns[*]}; do
+               local base=${pattern%%X*}
+               local suff=${pattern#$base}
+
+               pattern=$base...${suff/XXX}
+               echo "pattern=$pattern"
+               for (( i = 0; i < $count; i++ )); do
+                       fname=$(mktemp $DIR/$tdir/$pattern) ||
+                               error "mktemp $DIR/$tdir/$pattern failed"
+                       index2=$($LFS getstripe -m $fname)
+                       (( $index != $index2 )) && continue
+
+                       same=$((same + 1))
                done
        done
-       echo "$failed MDT index mismatches"
-       (( failed < 20 )) || error "MDT index mismatch $failed times"
 
+       echo "$((same/${#patterns[*]}))/$count matches, expect ~$expect for $1"
+       (( same / ${#patterns[*]} < expect * 5 / 4 &&
+          same / ${#patterns[*]} > expect * 4 / 5 )) ||
+               error "MDT index match $((same / ${#patterns[*]}))/$count times"
+}
+
+test_33h() {
+       (( $MDSCOUNT >= 2 )) || skip "needs >= 2 MDTs"
+       (( $MDS1_VERSION >= $(version_code 2.13.50) )) ||
+               skip "Need MDS version at least 2.13.50"
+
+       sub_33h crush
 }
-run_test 33h "temp file is located on the same MDT as target"
+run_test 33h "temp file is located on the same MDT as target (crush)"
+
+test_33hh() {
+       (( $MDSCOUNT >= 2 )) || skip "needs >= 2 MDTs"
+       echo "MDS1_VERSION=$MDS1_VERSION version_code=$(version_code 2.14.0.144)"
+       (( $MDS1_VERSION >= $(version_code 2.14.0.144) )) ||
+               skip "Need MDS version at least 2.14.0.144 for crush2"
+
+       sub_33h crush2
+}
+run_test 33hh "temp file is located on the same MDT as target (crush2)"
 
 TEST_34_SIZE=${TEST_34_SIZE:-2000000000000}
 test_34a() {
@@ -24590,9 +24671,9 @@ test_300h() {
 run_test 300h "check default striped directory for striped directory"
 
 test_300i() {
-       [ $PARALLEL == "yes" ] && skip "skip parallel run"
-       [ $MDSCOUNT -lt 2 ] && skip_env "needs >= 2 MDTs"
-       [ $MDS1_VERSION -lt $(version_code 2.7.55) ] &&
+       [[ $PARALLEL == "yes" ]] && skip "skip parallel run"
+       (( $MDSCOUNT >= 2 )) || skip_env "needs >= 2 MDTs"
+       (( $MDS1_VERSION >= $(version_code 2.7.55) )) ||
                skip "Need MDS version at least 2.7.55"
 
        local stripe_count
@@ -24623,11 +24704,34 @@ test_300i() {
 
        $LFS find -H fnv_1a_64,crush $DIR/$tdir/hashdir
        local dircnt=$($LFS find -H fnv_1a_64,crush $DIR/$tdir/hashdir | wc -l)
-       [ $dircnt -eq 2 ] || error "lfs find striped dir got:$dircnt,except:1"
-
-       #set the stripe to be unknown hash type
-       #define OBD_FAIL_UNKNOWN_LMV_STRIPE     0x1901
-       $LCTL set_param fail_loc=0x1901
+       (( $dircnt == 2 )) || error "lfs find striped dir got $dircnt != 2"
+
+       if (( $MDS1_VERSION >= $(version_code 2.14.0.144) )); then
+               set_opencache 1
+               stack_trap "restore_opencache"
+               $LFS mkdir -i0 -c$MDSCOUNT -H crush2 $DIR/$tdir/hashdir/d3 ||
+                       error "create crush2 dir $tdir/hashdir/d3 failed"
+               $LFS find -H crush2 $DIR/$tdir/hashdir
+               dircnt=$($LFS find -H crush2 $DIR/$tdir/hashdir | wc -l)
+               (( $dircnt == 1 )) || error "find crush2 dir got $dircnt != 1"
+
+               # mkdir with an invalid hash type (hash=fail_val) from client
+               # should be replaced on MDS with a valid (default) hash type
+               #define OBD_FAIL_LMV_UNKNOWN_STRIPE     0x1901
+               $LCTL set_param fail_loc=0x1901 fail_val=99
+               $LFS mkdir -i 0 -c2 $DIR/$tdir/hashdir/d99
+
+               $LFS getdirstripe $DIR/$tdir/hashdir/d99
+               local hash=$($LFS getdirstripe -H $DIR/$tdir/hashdir/d99)
+               local expect=$(do_facet mds1 \
+                       $LCTL get_param -n lod.$FSNAME-MDT0000-mdtlov.mdt_hash)
+               [[ $hash == $expect ]] ||
+                       error "d99 hash '$hash' != expected hash '$expect'"
+       fi
+
+       # set the stripe to an unknown hash type on read
+       #define OBD_FAIL_LMV_UNKNOWN_STRIPE     0x1901
+       $LCTL set_param fail_loc=0x1901 fail_val=99
        for ((i = 0; i < 10; i++)); do
                $CHECKSTAT -t file $DIR/$tdir/striped_dir/f-$i ||
                        error "stat f-$i failed"
index 19735d2..f4184f0 100755 (executable)
@@ -484,7 +484,7 @@ check_cpt_number() {
 # code is useful for comparing two version strings to see which is newer.
 version_code() {
        # split arguments like "1.8.6-wc3" into "1", "8", "6", "3"
-       eval set -- $(tr "[:punct:][a-z]" " " <<< $*)
+       eval set -- $(tr "[:punct:][a-zA-Z]" " " <<< $*)
 
        echo -n $(((${1:-0}<<24) | (${2:-0}<<16) | (${3:-0}<<8) | (${4:-0})))
 }
@@ -9702,6 +9702,9 @@ test_mkdir() {
        local overstripe_count
        local stripe_command="-c"
 
+       (( $MDS1_VERSION > $(version_code 2.14.0.149) )) &&
+               hash_name+=("crush2")
+
        while getopts "c:C:H:i:p" opt; do
                case $opt in
                        c) dirstripe_count=$OPTARG;;
index 12e5326..9c98846 100644 (file)
@@ -658,11 +658,10 @@ static int check_hashtype(const char *hashtype)
        int i;
 
        /* numeric hash type */
-       if (hashtype && strlen(hashtype) == 1 &&
-           (type_num > 0 && type_num < LMV_HASH_TYPE_MAX))
+       if (hashtype && lmv_is_known_hash_type(type_num))
                return type_num;
        /* string hash type */
-       for (i = LMV_HASH_TYPE_ALL_CHARS; i < LMV_HASH_TYPE_MAX; i++)
+       for (i = LMV_HASH_TYPE_ALL_CHARS; i < ARRAY_SIZE(mdt_hash_name); i++)
                if (strcmp(hashtype, mdt_hash_name[i]) == 0)
                        return i;
 
@@ -1555,7 +1554,9 @@ static int mdthash_input(char *string, __u32 *inflags,
                __u32 flag;
        } mhflist[] = {
                {"migrating", LMV_HASH_FLAG_MIGRATION},
+               {"bad_type", LMV_HASH_FLAG_BAD_TYPE},
                {"badtype", LMV_HASH_FLAG_BAD_TYPE},
+               {"lost_lmv", LMV_HASH_FLAG_LOST_LMV},
                {"lostlmv", LMV_HASH_FLAG_LOST_LMV},
        };
 
@@ -7068,7 +7069,7 @@ static int lfs_setdirstripe(int argc, char **argv)
                        lsa.lsa_pattern = check_hashtype(optarg);
                        if (lsa.lsa_pattern == 0) {
                                fprintf(stderr,
-                                       "%s %s: bad stripe hash type '%s'\n",
+                                       "%s %s: bad directory hash type '%s'\n",
                                        progname, argv[0], optarg);
                                return CMD_HELP;
                        }