Whamcloud - gitweb
LU-15720 dne: add crush2 hash type 15/47015/8
authorAndreas Dilger <adilger@whamcloud.com>
Tue, 12 Apr 2022 23:18:10 +0000 (17:18 -0600)
committerOleg Drokin <green@whamcloud.com>
Mon, 11 Jul 2022 06:48:37 +0000 (06:48 +0000)
The original "crush" hash type has a significant error with files
that have all-number suffixes, or suffixes that have non-alpha
characters in them.  These files will all be placed on the same
MDT as the base filename, which causes MDT imbalance.

Add a "crush2" hash type that has more stringent checks for the
suffix, so that it doesn't consider all-digit suffixes, or files
that only have a '.' at the right offset, as temporary files.

Test that the "broken" all-digit or extra-'.' filenames are hashed
properly with "crush2".  We also need to confirm that the old "crush"
hash has not changed (for name lookup compatibility) and still has
the original "bad hashing" bug that puts all files on the same MDT.

Fix handling of types beyond LMV_HASH_TYPE_CRUSH when creating dirs.

Fix debug layout printing of hash_type in more parts of the code.
Don't flood console if hash type is unrecognized in the future.

Fixes: 0a1cf8da8069 ("LU-11025 dne: introduce new directory hash type 'crush'")
Signed-off-by: Andreas Dilger <adilger@whamcloud.com>
Change-Id: I1ce34b8f3af44432f55307ebc6906677c6179d1d
Reviewed-on: https://review.whamcloud.com/47015
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Shuichi Ihara <sihara@ddn.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Lai Siyao <lai.siyao@whamcloud.com>
Reviewed-by: Yingjin Qian <qian@ddn.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
14 files changed:
lustre/include/lu_object.h
lustre/include/lustre_lmv.h
lustre/include/obd_support.h
lustre/include/uapi/linux/lustre/lustre_user.h
lustre/llite/dir.c
lustre/lmv/lmv_obd.c
lustre/lod/lod_object.c
lustre/lod/lproc_lod.c
lustre/mdd/mdd_dir.c
lustre/mdt/mdt_reint.c
lustre/mdt/mdt_restripe.c
lustre/tests/sanity.sh
lustre/tests/test-framework.sh
lustre/utils/lfs.c

index 9ceabaf..97dc3de 100644 (file)
@@ -1362,8 +1362,29 @@ static inline bool lu_name_is_dot_or_dotdot(const struct lu_name *lname)
        return name_is_dot_or_dotdot(lname->ln_name, lname->ln_namelen);
 }
 
+/**
+ * Determine if filename should be considered a "temporary" name.
+ *
+ * For temporary names, use only the main part of the filename and ignore
+ * the suffix, so that the filename will hash to the same MDT after it is
+ * renamed.  That avoids creating spurious remote entries for rsync, dcp,
+ * vi, and other tools that create a temporary name before renaming the file.
+ *
+ * The "CRUSH" and "CRUSH2" hash types are slightly different, and should
+ * not be modified without introducing a new hash type.  The hash algorithm
+ * forms an important part of the network protocol for striped directories,
+ * so if the hash function were "fixed" in any way it would prevent clients
+ * from looking up a filename on the right MDT.  LU-15692.
+ *
+ * \param[in] name             filename
+ * \param[in] namelen          length of @name
+ * \param[in] dot_prefix       if @name needs a leading '.' to be temporary
+ * \param[in] suffixlen                number of characters after '.' in @name to check
+ * \param[in] crush2           whether CRUSH or CRUSH2 heuristic should be used
+ */
 static inline bool lu_name_is_temp_file(const char *name, int namelen,
-                                       bool dot_prefix, int suffixlen)
+                                       bool dot_prefix, int suffixlen,
+                                       bool crush2)
 {
        int lower = 0;
        int upper = 0;
@@ -1377,21 +1398,46 @@ static inline bool lu_name_is_temp_file(const char *name, int namelen,
            name[namelen - suffixlen - 1] != '.')
                return false;
 
+       /* Any non-alphanumeric chars in the suffix for CRUSH2 mean the
+        * filename is *not* temporary.  The original CRUSH was incorrectly
+        * matching if a '.' happens to be in the right place, for example
+        * file.mdtest.12.12345 or output.6334.log, which is bad.  LU-15692
+        */
        while (len) {
-               lower += islower(name[namelen - len]);
-               upper += isupper(name[namelen - len]);
-               digit += isdigit(name[namelen - len]);
+               if (islower(name[namelen - len]))
+                       lower++;
+               else if (isupper(name[namelen - len]))
+                       upper++;
+               else if (isdigit(name[namelen - len]))
+                       digit++;
+               else if (crush2)
+                       return false;
                len--;
        }
-       /* mktemp() filename suffixes will have a mix of upper- and lower-case
-        * letters and/or numbers, not all numbers, or all upper or lower-case.
-        * About 0.07% of randomly-generated names will slip through,
+
+       /* mktemp() suffixes normally have a mix of upper- and lower-case
+        * letters and/or digits, rarely all upper- or lower-case or digits.
+        * Random all-digit suffixes are rare (1/45k for suffixlen=6), but
+        * common in normal usage (incrementing versions, dates, ranks, etc),
+        * so are considered non-temporary even if 1 or 2 non-numeric chars.
+        *
+        * About 0.07% of randomly-generated names will slip through, which
+        * only means that they may be renamed to a different MDT (slowdown),
         * but this avoids 99.93% of cross-MDT renames for those files.
         */
-       if ((digit >= suffixlen - 1 && !isdigit(name[namelen - suffixlen])) ||
-           upper == suffixlen || lower == suffixlen)
+       if (upper == suffixlen || lower == suffixlen)
                return false;
 
+       if (crush2) {
+               if (digit >= suffixlen - 1 &&
+                   isdigit(name[namelen - suffixlen]))
+                       return false;
+       } else { /* old crush incorrectly returns "true" for all-digit suffix */
+               if (digit >= suffixlen - 1 &&
+                   !isdigit(name[namelen - suffixlen]))
+                       return false;
+       }
+
        return true;
 }
 
index 2ffd77f..aafaed3 100644 (file)
@@ -124,21 +124,20 @@ lsm_md_eq(const struct lmv_stripe_md *lsm1, const struct lmv_stripe_md *lsm2)
 
 static inline void lsm_md_dump(int mask, const struct lmv_stripe_md *lsm)
 {
-       bool valid_hash = lmv_dir_bad_hash(lsm);
        int i;
 
-       /* If lsm_md_magic == LMV_MAGIC_FOREIGN pool_name may not be a null
-        * terminated string so only print LOV_MAXPOOLNAME bytes.
-        */
-       CDEBUG(mask,
-              "magic %#x stripe count %d master mdt %d hash type %s:%#x max-inherit %hhu max-inherit-rr %hhu version %d migrate offset %d migrate hash %#x pool %.*s\n",
+       CDEBUG_LIMIT(mask,
+              "dump LMV: magic=%#x count=%u index=%u hash=%s:%#x max_inherit=%hhu max_inherit_rr=%hhu version=%u migrate_offset=%u migrate_hash=%s:%x pool=%.*s\n",
               lsm->lsm_md_magic, lsm->lsm_md_stripe_count,
               lsm->lsm_md_master_mdt_index,
-              valid_hash ? "invalid hash" :
-                           mdt_hash_name[lsm->lsm_md_hash_type & (LMV_HASH_TYPE_MAX - 1)],
-              lsm->lsm_md_hash_type, lsm->lsm_md_max_inherit,
-              lsm->lsm_md_max_inherit_rr, lsm->lsm_md_layout_version,
-              lsm->lsm_md_migrate_offset, lsm->lsm_md_migrate_hash,
+              lmv_is_known_hash_type(lsm->lsm_md_hash_type) ?
+               mdt_hash_name[lsm->lsm_md_hash_type & LMV_HASH_TYPE_MASK] :
+               "invalid", lsm->lsm_md_hash_type,
+              lsm->lsm_md_max_inherit, lsm->lsm_md_max_inherit_rr,
+              lsm->lsm_md_layout_version, lsm->lsm_md_migrate_offset,
+              lmv_is_known_hash_type(lsm->lsm_md_migrate_hash) ?
+               mdt_hash_name[lsm->lsm_md_migrate_hash & LMV_HASH_TYPE_MASK] :
+               "invalid", lsm->lsm_md_migrate_hash,
               LOV_MAXPOOLNAME, lsm->lsm_md_pool_name);
 
        if (!lmv_dir_striped(lsm))
@@ -248,7 +247,7 @@ static inline __u32 crush_hash(__u32 a, __u32 b)
  * algorithm.
  */
 static inline unsigned int
-lmv_hash_crush(unsigned int count, const char *name, int namelen)
+lmv_hash_crush(unsigned int count, const char *name, int namelen, bool crush2)
 {
        unsigned long long straw;
        unsigned long long highest_straw = 0;
@@ -261,10 +260,10 @@ lmv_hash_crush(unsigned int count, const char *name, int namelen)
         * 1. rsync: .<target>.XXXXXX
         * 2. dstripe: <target>.XXXXXXXX
         */
-       if (lu_name_is_temp_file(name, namelen, true, 6)) {
+       if (lu_name_is_temp_file(name, namelen, true, 6, crush2)) {
                name++;
                namelen -= 8;
-       } else if (lu_name_is_temp_file(name, namelen, false, 8)) {
+       } else if (lu_name_is_temp_file(name, namelen, false, 8, crush2)) {
                namelen -= 9;
        } else if (lu_name_is_backup_file(name, namelen, &i)) {
                LASSERT(i < namelen);
@@ -343,7 +342,11 @@ __lmv_name_to_stripe_index(__u32 hash_type, __u32 stripe_count,
                        break;
                case LMV_HASH_TYPE_CRUSH:
                        stripe_index = lmv_hash_crush(stripe_count, name,
-                                                     namelen);
+                                                     namelen, false);
+                       break;
+               case LMV_HASH_TYPE_CRUSH2:
+                       stripe_index = lmv_hash_crush(stripe_count, name,
+                                                     namelen, true);
                        break;
                default:
                        return -EBADFD;
@@ -415,16 +418,19 @@ static inline bool lmv_user_magic_supported(__u32 lum_magic)
               lum_magic == LMV_MAGIC_FOREIGN;
 }
 
-#define LMV_DEBUG(mask, lmv, msg)                                      \
-       CDEBUG(mask,                                                    \
-              "%s LMV: magic=%#x count=%u index=%u hash=%s:%#x version=%u migrate offset=%u migrate hash=%s:%u.\n",\
-              msg, (lmv)->lmv_magic, (lmv)->lmv_stripe_count,          \
-              (lmv)->lmv_master_mdt_index,                             \
-              mdt_hash_name[(lmv)->lmv_hash_type & (LMV_HASH_TYPE_MAX - 1)],\
-              (lmv)->lmv_hash_type, (lmv)->lmv_layout_version,         \
-              (lmv)->lmv_migrate_offset,                               \
-              mdt_hash_name[(lmv)->lmv_migrate_hash & (LMV_HASH_TYPE_MAX - 1)],\
-              (lmv)->lmv_migrate_hash)
+/* Print the fields of an LMV layout, prefixed by @msg, via CDEBUG_LIMIT().
+ * A hash type that fails lmv_is_known_hash_type() is shown as "invalid"
+ * instead of being used as an index into mdt_hash_name[].  At most
+ * LOV_MAXPOOLNAME bytes of the pool name are printed.
+ * @lmv is expanded several times, so pass an expression without side effects.
+ */
+#define LMV_DEBUG(mask, lmv, msg)                                            \
+       CDEBUG_LIMIT(mask,                                                    \
+              "%s LMV: magic=%#x count=%u index=%u hash=%s:%#x version=%u migrate_offset=%u migrate_hash=%s:%x pool=%.*s\n",\
+              msg, (lmv)->lmv_magic, (lmv)->lmv_stripe_count,                \
+              (lmv)->lmv_master_mdt_index,                                   \
+              lmv_is_known_hash_type((lmv)->lmv_hash_type) ?                 \
+               mdt_hash_name[(lmv)->lmv_hash_type & LMV_HASH_TYPE_MASK] :    \
+               "invalid", (lmv)->lmv_hash_type,                              \
+              (lmv)->lmv_layout_version, (lmv)->lmv_migrate_offset,          \
+              lmv_is_known_hash_type((lmv)->lmv_migrate_hash) ?              \
+               mdt_hash_name[(lmv)->lmv_migrate_hash & LMV_HASH_TYPE_MASK] : \
+               "invalid", (lmv)->lmv_migrate_hash,                           \
+              LOV_MAXPOOLNAME, (lmv)->lmv_pool_name)
 
 /* master LMV is sane */
 static inline bool lmv_is_sane(const struct lmv_mds_md_v1 *lmv)
@@ -443,7 +449,7 @@ static inline bool lmv_is_sane(const struct lmv_mds_md_v1 *lmv)
 
        return true;
 insane:
-       LMV_DEBUG(D_ERROR, lmv, "insane");
+       LMV_DEBUG(D_ERROR, lmv, "unknown layout");
        return false;
 }
 
@@ -465,7 +471,7 @@ static inline bool lmv_is_sane2(const struct lmv_mds_md_v1 *lmv)
 
        return true;
 insane:
-       LMV_DEBUG(D_ERROR, lmv, "insane");
+       LMV_DEBUG(D_ERROR, lmv, "unknown layout");
        return false;
 }
 
index 2e2ad79..80afed0 100644 (file)
@@ -681,7 +681,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_MIGRATE_ENTRIES               0x1801
 
 /* LMV */
-#define OBD_FAIL_UNKNOWN_LMV_STRIPE            0x1901
+#define OBD_FAIL_LMV_UNKNOWN_STRIPE            0x1901
 
 /* FLR */
 #define OBD_FAIL_FLR_LV_DELAY                  0x1A01
index edc4511..620e26a 100644 (file)
@@ -1013,10 +1013,12 @@ struct lmv_user_mds_data {
 
 enum lmv_hash_type {
        LMV_HASH_TYPE_UNKNOWN   = 0,    /* 0 is reserved for testing purpose */
-       LMV_HASH_TYPE_ALL_CHARS = 1,
-       LMV_HASH_TYPE_FNV_1A_64 = 2,
-       LMV_HASH_TYPE_CRUSH     = 3,
+       LMV_HASH_TYPE_ALL_CHARS = 1,    /* simple sum of characters */
+       LMV_HASH_TYPE_FNV_1A_64 = 2,    /* reasonable non-cryptographic hash */
+       LMV_HASH_TYPE_CRUSH     = 3,    /* double-hash to optimize migration */
+       LMV_HASH_TYPE_CRUSH2    = 4,    /* CRUSH with small fixes, LU-15692 */
        LMV_HASH_TYPE_MAX,
+       /* alias of an existing type; listed after LMV_HASH_TYPE_MAX so that
+        * it does not change the value of LMV_HASH_TYPE_MAX
+        */
+       LMV_HASH_TYPE_DEFAULT   = LMV_HASH_TYPE_FNV_1A_64
 };
 
 static __attribute__((unused)) const char *mdt_hash_name[] = {
@@ -1024,9 +1026,9 @@ static __attribute__((unused)) const char *mdt_hash_name[] = {
        "all_char",
        "fnv_1a_64",
        "crush",
+       "crush2",
 };
 
-#define LMV_HASH_TYPE_DEFAULT LMV_HASH_TYPE_FNV_1A_64
 
 /* Right now only the lower part(0-16bits) of lmv_hash_type is being used,
  * and the higher part will be the flag to indicate the status of object,
@@ -1036,9 +1038,8 @@ static __attribute__((unused)) const char *mdt_hash_name[] = {
 
+/* Return true if the hash-type bits of @type (flag bits are masked off)
+ * name a real hash function: above LMV_HASH_TYPE_UNKNOWN and below
+ * LMV_HASH_TYPE_MAX.
+ */
 static inline bool lmv_is_known_hash_type(__u32 type)
 {
-       return (type & LMV_HASH_TYPE_MASK) == LMV_HASH_TYPE_FNV_1A_64 ||
-              (type & LMV_HASH_TYPE_MASK) == LMV_HASH_TYPE_ALL_CHARS ||
-              (type & LMV_HASH_TYPE_MASK) == LMV_HASH_TYPE_CRUSH;
+       const __u32 hash = type & LMV_HASH_TYPE_MASK;
+
+       return hash > LMV_HASH_TYPE_UNKNOWN && hash < LMV_HASH_TYPE_MAX;
 }
 
 /* fixed layout, such directories won't split automatically */
index f6e4779..7f77d31 100644 (file)
@@ -469,9 +469,10 @@ static int ll_dir_setdirstripe(struct dentry *dparent, struct lmv_user_md *lump,
 
        if (lump->lum_magic != LMV_MAGIC_FOREIGN) {
                CDEBUG(D_VFSTRACE,
-                      "VFS Op:inode="DFID"(%p) name %s stripe_offset %d, stripe_count: %u\n",
+                      "VFS Op:inode="DFID"(%p) name=%s stripe_offset=%d stripe_count=%u, hash_type=%x\n",
                       PFID(ll_inode2fid(parent)), parent, dirname,
-                      (int)lump->lum_stripe_offset, lump->lum_stripe_count);
+                      (int)lump->lum_stripe_offset, lump->lum_stripe_count,
+                      lump->lum_hash_type);
        } else {
                struct lmv_foreign_md *lfm = (struct lmv_foreign_md *)lump;
 
@@ -492,7 +493,9 @@ static int ll_dir_setdirstripe(struct dentry *dparent, struct lmv_user_md *lump,
        /* MDS < 2.14 doesn't support 'crush' hash type, and cannot handle
         * unknown hash if client doesn't set a valid one. switch to fnv_1a_64.
         */
-       if (!(exp_connect_flags2(sbi->ll_md_exp) & OBD_CONNECT2_CRUSH)) {
+       if (CFS_FAIL_CHECK(OBD_FAIL_LMV_UNKNOWN_STRIPE)) {
+               lump->lum_hash_type = cfs_fail_val;
+       } else if (!(exp_connect_flags2(sbi->ll_md_exp) & OBD_CONNECT2_CRUSH)) {
                enum lmv_hash_type type = lump->lum_hash_type &
                                          LMV_HASH_TYPE_MASK;
 
index 5b8c03d..1431ec4 100644 (file)
@@ -3343,8 +3343,8 @@ static int lmv_unpack_md_v1(struct obd_export *exp, struct lmv_stripe_md *lsm,
        lsm->lsm_md_magic = le32_to_cpu(lmm1->lmv_magic);
        lsm->lsm_md_stripe_count = le32_to_cpu(lmm1->lmv_stripe_count);
        lsm->lsm_md_master_mdt_index = le32_to_cpu(lmm1->lmv_master_mdt_index);
-       if (OBD_FAIL_CHECK(OBD_FAIL_UNKNOWN_LMV_STRIPE))
-               lsm->lsm_md_hash_type = LMV_HASH_TYPE_UNKNOWN;
+       if (CFS_FAIL_CHECK(OBD_FAIL_LMV_UNKNOWN_STRIPE))
+               lsm->lsm_md_hash_type = cfs_fail_val ?: LMV_HASH_TYPE_UNKNOWN;
        else
                lsm->lsm_md_hash_type = le32_to_cpu(lmm1->lmv_hash_type);
        lsm->lsm_md_layout_version = le32_to_cpu(lmm1->lmv_layout_version);
index 504bb5f..019236e 100644 (file)
@@ -2396,16 +2396,19 @@ static int lod_declare_xattr_set_lmv(const struct lu_env *env,
                                     struct dt_object_format *dof,
                                     struct thandle *th)
 {
-       struct lod_object       *lo = lod_dt_obj(dt);
-       struct lmv_user_md_v1   *lum = lum_buf->lb_buf;
-       int                     rc;
-       ENTRY;
+       struct lod_object *lo = lod_dt_obj(dt);
+       struct lmv_user_md_v1 *lum = lum_buf->lb_buf;
+       int rc;
 
+       ENTRY;
        LASSERT(lum != NULL);
 
-       CDEBUG(D_INFO, "lum magic = %x count = %u offset = %d\n",
-              le32_to_cpu(lum->lum_magic), le32_to_cpu(lum->lum_stripe_count),
-              (int)le32_to_cpu(lum->lum_stripe_offset));
+       CDEBUG(D_INFO,
+              "lum magic=%x hash=%x count=%u offset=%d inherit=%u rr=%u\n",
+              le32_to_cpu(lum->lum_magic), le32_to_cpu(lum->lum_hash_type),
+              le32_to_cpu(lum->lum_stripe_count),
+              (int)le32_to_cpu(lum->lum_stripe_offset),
+              lum->lum_max_inherit, lum->lum_max_inherit_rr);
 
        if (lo->ldo_dir_stripe_count == 0) {
                if (lo->ldo_is_foreign) {
@@ -2436,7 +2439,7 @@ out:
  *
  * \param[in] env      execution environment
  * \param[in] dt       target object
- * \param[in] buf      LMV buf which contains source stripe fids
+ * \param[in] lmv_buf  LMV buf which contains source stripe FIDs
  * \param[in] fl       set or replace
  * \param[in] th       transaction handle
  *
@@ -2445,14 +2448,14 @@ out:
  */
 static int lod_dir_layout_set(const struct lu_env *env,
                              struct dt_object *dt,
-                             const struct lu_buf *buf,
+                             const struct lu_buf *lmv_buf,
                              int fl,
                              struct thandle *th)
 {
        struct dt_object *next = dt_object_child(dt);
        struct lod_object *lo = lod_dt_obj(dt);
        struct lod_device *lod = lu2lod_dev(lod2lu_obj(lo)->lo_dev);
-       struct lmv_mds_md_v1 *lmv = buf->lb_buf;
+       struct lmv_mds_md_v1 *lmv = lmv_buf->lb_buf;
        struct lmv_mds_md_v1 *slave_lmv;
        struct lu_buf slave_buf;
        int i;
@@ -2472,7 +2475,7 @@ static int lod_dir_layout_set(const struct lu_env *env,
 
        LMV_DEBUG(D_INFO, lmv, "set");
 
-       rc = lod_sub_xattr_set(env, next, buf, XATTR_NAME_LMV, fl, th);
+       rc = lod_sub_xattr_set(env, next, lmv_buf, XATTR_NAME_LMV, fl, th);
        if (rc)
                RETURN(rc);
 
@@ -5551,8 +5554,9 @@ static void lod_striping_from_default(struct lod_object *lo,
                        struct lod_layout_component *def_comp =
                                                &lds->lds_def_comp_entries[i];
 
-                       CDEBUG(D_LAYOUT, "Inherit from default: flags=%#x "
-                              "size=%hu nr=%u offset=%u pattern=%#x pool=%s\n",
+                       CDEBUG(D_LAYOUT,
+                              "inherit "DFID" file layout from default: flags=%#x size=%hu nr=%u offset=%u pattern=%#x pool=%s\n",
+                              PFID(lu_object_fid(&lo->ldo_obj.do_lu)),
                               def_comp->llc_flags,
                               def_comp->llc_stripe_size,
                               def_comp->llc_stripe_count,
@@ -5601,11 +5605,12 @@ static void lod_striping_from_default(struct lod_object *lo,
                if (lo->ldo_dir_stripe_offset == -1)
                        lo->ldo_dir_stripe_offset =
                                lds->lds_dir_def_stripe_offset;
-               if (lo->ldo_dir_hash_type == 0)
+               if (lo->ldo_dir_hash_type == LMV_HASH_TYPE_UNKNOWN)
                        lo->ldo_dir_hash_type = lds->lds_dir_def_hash_type;
 
-               CDEBUG(D_LAYOUT, "striping from default dir: count:%hu, "
-                      "offset:%u, hash_type:%u\n",
+               CDEBUG(D_LAYOUT,
+                      "inherit "DFID" dir layout from default: count=%hu offset=%u hash_type=%x\n",
+                      PFID(lu_object_fid(&lo->ldo_obj.do_lu)),
                       lo->ldo_dir_stripe_count, lo->ldo_dir_stripe_offset,
                       lo->ldo_dir_hash_type);
        }
@@ -5644,8 +5649,8 @@ static inline bool lod_need_inherit_more(struct lod_object *lo, bool from_root,
  * This method is used to make a decision on the striping configuration for the
  * object being created. It can be taken from the \a parent object if it exists,
  * or filesystem's default. The resulting configuration (number of stripes,
- * stripe size/offset, pool name, etc) is stored in the object itself and will
- * be used by the methods like ->doo_declare_create().
+ * stripe size/offset, pool name, hash_type, etc.) is stored in the object
+ * itself and will be used by the methods like ->doo_declare_create().
  *
  * \see dt_object_operations::do_ah_init() in the API description for details.
  */
@@ -5725,7 +5730,7 @@ static void lod_ah_init(const struct lu_env *env,
                        lc->ldo_dir_hash_type =
                                le32_to_cpu(lum1->lum_hash_type);
                        CDEBUG(D_INFO,
-                              "set dirstripe: count %hu, offset %d, hash %u\n",
+                              "set dirstripe: count %hu, offset %d, hash %x\n",
                                lc->ldo_dir_stripe_count,
                                (int)lc->ldo_dir_stripe_offset,
                                lc->ldo_dir_hash_type);
@@ -5803,11 +5808,12 @@ static void lod_ah_init(const struct lu_env *env,
                                lc->ldo_dir_stripe_count = 0;
                }
 
-               if (!(lc->ldo_dir_hash_type & LMV_HASH_TYPE_MASK))
-                       lc->ldo_dir_hash_type |=
+               if (!lmv_is_known_hash_type(lc->ldo_dir_hash_type))
+                       lc->ldo_dir_hash_type =
+                               (lc->ldo_dir_hash_type & LMV_HASH_FLAG_KNOWN) |
                                d->lod_mdt_descs.ltd_lmv_desc.ld_pattern;
 
-               CDEBUG(D_INFO, "final dir stripe [%hu %d %u]\n",
+               CDEBUG(D_INFO, "final dir stripe_count=%hu offset=%d hash=%u\n",
                       lc->ldo_dir_stripe_count,
                       (int)lc->ldo_dir_stripe_offset, lc->ldo_dir_hash_type);
 
index de44d74..363b3a7 100644 (file)
@@ -1071,24 +1071,30 @@ static ssize_t mdt_hash_store(struct kobject *kobj, struct attribute *attr,
        struct lod_device *lod = dt2lod_dev(dt);
        char *hash;
        int len;
+       int rc = -EINVAL;
        int i;
 
        hash = kstrndup(buffer, count, GFP_KERNEL);
        if (!hash)
                return -ENOMEM;
 
+       if (kstrtoint(hash, 10, &i) == 0 && lmv_is_known_hash_type(i)) {
+               lod->lod_mdt_descs.ltd_lmv_desc.ld_pattern = i;
+               GOTO(out, rc = count);
+       }
+
        len = strcspn(hash, "\n ");
        hash[len] = '\0';
-       for (i = LMV_HASH_TYPE_ALL_CHARS; i < LMV_HASH_TYPE_MAX; i++) {
-               if (!strcmp(hash, mdt_hash_name[i])) {
+       for (i = LMV_HASH_TYPE_ALL_CHARS; i < ARRAY_SIZE(mdt_hash_name); i++) {
+               if (strcmp(hash, mdt_hash_name[i]) == 0) {
                        lod->lod_mdt_descs.ltd_lmv_desc.ld_pattern = i;
-                       kfree(hash);
-                       return count;
+                       GOTO(out, rc = count);
                }
        }
+out:
        kfree(hash);
 
-       return -EINVAL;
+       return rc;
 }
 LUSTRE_RW_ATTR(mdt_hash);
 
index 2d32826..1250bd1 100644 (file)
@@ -2098,13 +2098,14 @@ static int mdd_create_sanity_check(const struct lu_env *env,
                if (!lmv_user_magic_supported(le32_to_cpu(lum->lum_magic)) &&
                    le32_to_cpu(lum->lum_magic) != LMV_USER_MAGIC_V0) {
                        rc = -EINVAL;
-                       CERROR("%s: invalid lmv_user_md: magic = %x, "
-                              "stripe_offset = %d, stripe_count = %u: "
-                              "rc = %d\n", mdd2obd_dev(m)->obd_name,
-                               le32_to_cpu(lum->lum_magic),
+                       CERROR("%s: invalid lmv_user_md: magic=%x hash=%x stripe_offset=%d stripe_count=%u: rc = %d\n",
+                              mdd2obd_dev(m)->obd_name,
+                              le32_to_cpu(lum->lum_magic),
+                              le32_to_cpu(lum->lum_hash_type),
                               (int)le32_to_cpu(lum->lum_stripe_offset),
                               le32_to_cpu(lum->lum_stripe_count), rc);
-                       return rc;
+
+                       RETURN(rc);
                }
        }
 
index c800ca6..3f1eca6 100644 (file)
@@ -539,7 +539,7 @@ static int mdt_create(struct mdt_thread_info *info)
                }
 
                if ((!(exp_connect_flags2(exp) & OBD_CONNECT2_CRUSH)) &&
-                   (le32_to_cpu(lum->lum_hash_type) & LMV_HASH_TYPE_MASK) ==
+                   (le32_to_cpu(lum->lum_hash_type) & LMV_HASH_TYPE_MASK) >=
                    LMV_HASH_TYPE_CRUSH)
                        RETURN(-EPROTO);
 
index 22559f5..07fe836 100644 (file)
@@ -607,12 +607,14 @@ static int mdt_restripe_migrate(struct mdt_thread_info *info)
        if ((lmv_is_splitting(lmv) &&
             idx >= le32_to_cpu(lmv->lmv_split_offset)) ||
            (lmv_is_merging(lmv) &&
-            (le32_to_cpu(lmv->lmv_hash_type) & LMV_HASH_TYPE_MASK) ==
-               LMV_HASH_TYPE_CRUSH &&
+            ((le32_to_cpu(lmv->lmv_hash_type) & LMV_HASH_TYPE_MASK) ==
+                                                LMV_HASH_TYPE_CRUSH ||
+             (le32_to_cpu(lmv->lmv_hash_type) & LMV_HASH_TYPE_MASK) ==
+                                                LMV_HASH_TYPE_CRUSH2) &&
             idx < le32_to_cpu(lmv->lmv_merge_offset))) {
                /* new stripes doesn't need to migrate sub files in dir
                 * split, neither for target stripes in dir merge if hash type
-                * is CRUSH.
+                * is CRUSH or CRUSH2.
                 */
                rc = mdt_restripe_migrate_finish(info, stripe, lmv);
                RETURN(rc);
index 4f5ff0f..5422fb5 100755 (executable)
@@ -2894,6 +2894,8 @@ test_27K() {
                $DIR/$tdir/${tdir}2 ||
                error "$DIR/$tdir/${tdir}2: create failed"
 
+       $LFS getdirstripe -v $DIR/$tdir/${tdir}2
+
        $LFS getdirstripe -v $DIR/$tdir/${tdir}2 |
                grep "lfm_magic:.*0x0CD50CD0" ||
                error "$DIR/$tdir/${tdir}2: invalid LMV EA magic"
@@ -3235,6 +3237,8 @@ test_27P() {
                --flags=0xda05 --mode 0750 $DIR/$tdir/${tdir} ||
                error "$DIR/$tdir/${tdir}: create failed"
 
+       $LFS getdirstripe -v $DIR/$tdir/${tdir}
+
        $LFS getdirstripe -v $DIR/$tdir/${tdir} |
                grep "lfm_magic:.*0x0CD50CD0" ||
                error "$DIR/$tdir/${tdir}: invalid LMV EA magic"
@@ -4236,45 +4240,122 @@ test_33g() {
 }
 run_test 33g "nonroot user create already existing root created file"
 
-test_33h() {
-       [ $MDSCOUNT -lt 2 ] && skip_env "needs >= 2 MDTs"
-       [ $MDS1_VERSION -lt $(version_code 2.13.50) ] &&
-               skip "Need MDS version at least 2.13.50"
+# Common body of test_33h/test_33hh: check filename-to-MDT placement in a
+# directory striped over all MDTs with the given hash type.
+# usage: sub_33h <hash_type>   (crush or crush2)
+# - backup names (.bak, .SAV, .orig, ~) must land on the MDT of $tfile
+# - mktemp-style names (.$tfile.XXXXXX, $tfile.XXXXXXXX) should too, with
+#   only a few mismatches tolerated
+# - all-digit suffixes, and suffixes containing '.', stay on that MDT only
+#   for the old "crush"; "crush2" should spread them over the MDTs
+sub_33h() {
+       local hash_type=$1
+       local count=250
 
-       test_mkdir -c $MDSCOUNT -H crush $DIR/$tdir ||
-               error "mkdir $tdir failed"
+       test_mkdir -c $MDSCOUNT -H $hash_type $DIR/$tdir ||
+               error "lfs mkdir -H $hash_type $tdir failed"
        touch $DIR/$tdir/$tfile || error "touch $tfile failed"
 
        local index=$($LFS getstripe -m $DIR/$tdir/$tfile)
        local index2
+       local fname
 
        for fname in $DIR/$tdir/$tfile.bak \
                     $DIR/$tdir/$tfile.SAV \
                     $DIR/$tdir/$tfile.orig \
                     $DIR/$tdir/$tfile~; do
-               touch $fname  || error "touch $fname failed"
+               touch $fname || error "touch $fname failed"
                index2=$($LFS getstripe -m $fname)
-               [ $index -eq $index2 ] ||
+               (( $index == $index2 )) ||
                        error "$fname MDT index mismatch $index != $index2"
        done
 
        local failed=0
-       for i in {1..250}; do
-               for fname in $(mktemp -u $DIR/$tdir/.$tfile.XXXXXX) \
-                            $(mktemp $DIR/$tdir/$tfile.XXXXXXXX); do
-                       touch $fname  || error "touch $fname failed"
+       local patterns=(".$tfile.XXXXXX" "$tfile.XXXXXXXX")
+       local pattern
+
+       for pattern in ${patterns[*]}; do
+               echo "pattern $pattern"
+               fname=$DIR/$tdir/$pattern
+               for (( i = 0; i < $count; i++ )); do
+                       fname=$(mktemp $DIR/$tdir/$pattern) ||
+                               error "mktemp $DIR/$tdir/$pattern failed"
                        index2=$($LFS getstripe -m $fname)
-                       if [[ $index != $index2 ]]; then
-                               failed=$((failed + 1))
-                               echo "$fname MDT index mismatch $index != $index2"
-                       fi
+                       (( $index == $index2 )) && continue
+
+                       failed=$((failed + 1))
+                       echo "$fname MDT index mismatch $index != $index2"
+               done
+       done
+
+       echo "$failed/$count MDT index mismatches, expect ~2-4"
+       (( failed < 10 )) || error "MDT index mismatch $failed/$count times"
+
+       local same=0
+       local expect
+
+       # verify that "crush" is still broken with all files on same MDT,
+       # crush2 should have about 1/MDSCOUNT files on each MDT, with margin
+       [[ "$hash_type" == "crush" ]] && expect=$count ||
+               expect=$((count / MDSCOUNT))
+
+       # crush2 doesn't put all-numeric suffixes on the same MDT,
+       # filename like $tfile.12345678 should *not* be considered temp
+       for pattern in ${patterns[*]}; do
+               local base=${pattern%%X*}
+               local suff=${pattern#$base}
+
+               echo "pattern $pattern"
+               for (( i = 0; i < $count; i++ )); do
+                       fname=$DIR/$tdir/$base$((${suff//X/1} + i))
+                       touch $fname || error "touch $fname failed"
+                       index2=$($LFS getstripe -m $fname)
+                       (( $index != $index2 )) && continue
+
+                       same=$((same + 1))
                done
        done
-       echo "$failed MDT index mismatches"
-       (( failed < 20 )) || error "MDT index mismatch $failed times"
 
+       echo "$((same/${#patterns[*]}))/$count matches, expect ~$expect for $1"
+       (( same / ${#patterns[*]} < expect * 5 / 4 &&
+          same / ${#patterns[*]} > expect * 4 / 5 )) ||
+               error "MDT index match $((same / ${#patterns[*]}))/$count times"
+       same=0
+
+       # crush2 doesn't put suffixes with special characters on the same MDT
+       # filename like $tfile.txt.1234 should *not* be considered temp
+       for pattern in ${patterns[*]}; do
+               local base=${pattern%%X*}
+               local suff=${pattern#$base}
+
+               pattern=$base...${suff/XXX}
+               echo "pattern=$pattern"
+               for (( i = 0; i < $count; i++ )); do
+                       # on failure $fname is empty, so report the template
+                       fname=$(mktemp $DIR/$tdir/$pattern) ||
+                               error "mktemp $DIR/$tdir/$pattern failed"
+                       index2=$($LFS getstripe -m $fname)
+                       (( $index != $index2 )) && continue
+
+                       same=$((same + 1))
+               done
+       done
+
+       echo "$((same/${#patterns[*]}))/$count matches, expect ~$expect for $1"
+       (( same / ${#patterns[*]} < expect * 5 / 4 &&
+          same / ${#patterns[*]} > expect * 4 / 5 )) ||
+               error "MDT index match $((same / ${#patterns[*]}))/$count times"
 }
-run_test 33h "temp file is located on the same MDT as target"
+
+# Run the shared sub_33h checks against the original "crush" hash type,
+# including the check that its all-digit/extra-'.' suffix placement is
+# unchanged (such names still land on the same MDT as the base name).
+test_33h() {
+       (( $MDSCOUNT >= 2 )) || skip "needs >= 2 MDTs"
+       (( $MDS1_VERSION >= $(version_code 2.13.50) )) ||
+               skip "Need MDS version at least 2.13.50"
+
+       sub_33h crush
+}
+run_test 33h "temp file is located on the same MDT as target (crush)"
+
+# Run the shared sub_33h checks against the "crush2" hash type, which should
+# spread all-digit and extra-'.' suffixes across MDTs.
+test_33hh() {
+       (( $MDSCOUNT >= 2 )) || skip "needs >= 2 MDTs"
+       echo "MDS1_VERSION=$MDS1_VERSION version_code=$(version_code 2.15.0)"
+       # strictly newer: an MDS at exactly 2.15.0 is skipped
+       (( $MDS1_VERSION > $(version_code 2.15.0) )) ||
+               skip "Need MDS version newer than 2.15.0 for crush2"
+
+       sub_33h crush2
+}
+run_test 33hh "temp file is located on the same MDT as target (crush2)"
 
 test_33i()
 {
@@ -23468,9 +23549,9 @@ test_300h() {
 run_test 300h "check default striped directory for striped directory"
 
 test_300i() {
-       [ $PARALLEL == "yes" ] && skip "skip parallel run"
-       [ $MDSCOUNT -lt 2 ] && skip_env "needs >= 2 MDTs"
-       [ $MDS1_VERSION -lt $(version_code 2.7.55) ] &&
+       [[ $PARALLEL == "yes" ]] && skip "skip parallel run"
+       (( $MDSCOUNT >= 2 )) || skip_env "needs >= 2 MDTs"
+       (( $MDS1_VERSION >= $(version_code 2.7.55) )) ||
                skip "Need MDS version at least 2.7.55"
 
        local stripe_count
@@ -23501,11 +23582,31 @@ test_300i() {
 
        $LFS find -H fnv_1a_64,crush $DIR/$tdir/hashdir
        local dircnt=$($LFS find -H fnv_1a_64,crush $DIR/$tdir/hashdir | wc -l)
-       [ $dircnt -eq 2 ] || error "lfs find striped dir got:$dircnt,except:1"
-
-       #set the stripe to be unknown hash type
-       #define OBD_FAIL_UNKNOWN_LMV_STRIPE     0x1901
-       $LCTL set_param fail_loc=0x1901
+       (( $dircnt == 2 )) || error "lfs find striped dir got $dircnt != 2"
+
+       if (( $MDS1_VERSION > $(version_code 2.15.0) )); then
+               $LFS mkdir -i0 -c$MDSCOUNT -H crush2 $DIR/$tdir/hashdir/d3 ||
+                       error "create crush2 dir $tdir/hashdir/d3 failed"
+               $LFS find -H crush2 $DIR/$tdir/hashdir
+               dircnt=$($LFS find -H crush2 $DIR/$tdir/hashdir | wc -l)
+               (( $dircnt == 1 )) || error "find crush2 dir got $dircnt != 1"
+
+               # mkdir with an invalid hash type (hash=fail_val) from client
+               # should be replaced on MDS with a valid (default) hash type
+               #define OBD_FAIL_LMV_UNKNOWN_STRIPE     0x1901
+               $LCTL set_param fail_loc=0x1901 fail_val=99
+               $LFS mkdir -c2 $DIR/$tdir/hashdir/d99
+
+               local hash=$($LFS getdirstripe -H $DIR/$tdir/hashdir/d99)
+               local expect=$(do_facet mds1 \
+                       $LCTL get_param -n lod.$FSNAME-MDT0000-mdtlov.mdt_hash)
+               [[ $hash == $expect ]] ||
+                       error "d99 hash '$hash' != expected hash '$expect'"
+       fi
+
+       #set the stripe to be unknown hash type on read
+       #define OBD_FAIL_LMV_UNKNOWN_STRIPE     0x1901
+       $LCTL set_param fail_loc=0x1901 fail_val=99
        for ((i = 0; i < 10; i++)); do
                $CHECKSTAT -t file $DIR/$tdir/striped_dir/f-$i ||
                        error "stat f-$i failed"
index f148131..3727f63 100755 (executable)
@@ -468,7 +468,7 @@ check_cpt_number() {
 # code is useful for comparison two version strings to see which is newer.
 version_code() {
        # split arguments like "1.8.6-wc3" into "1", "8", "6", "3"
-       eval set -- $(tr "[:punct:][a-z]" " " <<< $*)
+       # tr turns punctuation and letters of either case into spaces;
+       # "set --" then makes the remaining fields the positional parameters
+       eval set -- $(tr "[:punct:][a-zA-Z]" " " <<< $*)
 
+       # pack the first four fields into one integer, treating a missing
+       # field as 0: $1<<24 | $2<<16 | $3<<8 | $4
        echo -n $(((${1:-0}<<24) | (${2:-0}<<16) | (${3:-0}<<8) | (${4:-0})))
 }
@@ -9073,6 +9073,9 @@ test_mkdir() {
        local dirstripe_index=${DIRSTRIPE_INDEX:-$((base % $MDSCOUNT))}
        local OPTIND=1
 
+       (( $MDS1_VERSION > $(version_code 2.15.0) )) &&
+               hash_name+=("crush2")
+
        while getopts "c:H:i:p" opt; do
                case $opt in
                        c) dirstripe_count=$OPTARG;;
index 823cda2..8f4e91b 100644 (file)
@@ -592,11 +592,10 @@ static int check_hashtype(const char *hashtype)
        int i;
 
        /* numeric hash type */
-       if (hashtype && strlen(hashtype) == 1 &&
-           (type_num > 0 && type_num < LMV_HASH_TYPE_MAX))
+       if (hashtype && lmv_is_known_hash_type(type_num))
                return type_num;
        /* string hash type */
-       for (i = LMV_HASH_TYPE_ALL_CHARS; i < LMV_HASH_TYPE_MAX; i++)
+       for (i = LMV_HASH_TYPE_ALL_CHARS; i < ARRAY_SIZE(mdt_hash_name); i++)
                if (strcmp(hashtype, mdt_hash_name[i]) == 0)
                        return i;
 
@@ -1371,7 +1370,9 @@ static int mdthash_input(char *string, __u32 *inflags,
                __u32 flag;
        } mhflist[] = {
                {"migrating", LMV_HASH_FLAG_MIGRATION},
+               {"bad_type", LMV_HASH_FLAG_BAD_TYPE},
                {"badtype", LMV_HASH_FLAG_BAD_TYPE},
+               {"lost_lmv", LMV_HASH_FLAG_LOST_LMV},
                {"lostlmv", LMV_HASH_FLAG_LOST_LMV},
        };
 
@@ -6758,7 +6759,7 @@ static int lfs_setdirstripe(int argc, char **argv)
                        lsa.lsa_pattern = check_hashtype(optarg);
                        if (lsa.lsa_pattern == 0) {
                                fprintf(stderr,
-                                       "%s %s: bad stripe hash type '%s'\n",
+                                       "%s %s: bad directory hash type '%s'\n",
                                        progname, argv[0], optarg);
                                return CMD_HELP;
                        }