Whamcloud - gitweb
LU-12624 lod: alloc dir stripes by QoS 25/35825/13
author Lai Siyao <lai.siyao@whamcloud.com>
Sun, 4 Aug 2019 18:08:02 +0000 (02:08 +0800)
committer Oleg Drokin <green@whamcloud.com>
Mon, 28 Oct 2019 17:06:18 +0000 (17:06 +0000)
Similar to file OST object allocation, introduce directory stripe
allocation by space usage. The two do not share the same code because
of the many differences between them: a file has mirrors, PFL and object
precreation, while for a directory the first stripe is always on the
same MDT as its master object. The changes include:
* add lod_mdt_alloc_qos() to allocate stripes by space/inode usage.
* add lod_mdt_alloc_rr() to allocate stripes round-robin.
* add lod_mdt_alloc_specific() to allocate stripes in the old way.
* add sysfs support for the lmv_desc field in the LOD structure, and
  move those that remain in procfs to sysfs.

This patch also changes LMV QoS code:
* mkdir by QoS if the user runs 'lfs mkdir -i -1 ...', or if the
  starting MDT index of the parent directory's default LMV is -1.
* with the above change, the 'space' hash flag is no longer needed;
  remove all related code.
* previously the 'lfs mkdir -i -1' QoS code was in lfs_setdirstripe();
  now that it is done in LMV, remove the old code.

Update sanity tests 413a and 413b to support QoS mkdir of both plain
and striped directories.

Update the lfs-setdirstripe man page to reflect the changes.

Signed-off-by: Lai Siyao <lai.siyao@whamcloud.com>
Change-Id: I8f5f8e46faae68ffd9a49a4ac1d450e951e979c5
Reviewed-on: https://review.whamcloud.com/35825
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Hongchao Zhang <hongchao@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
22 files changed:
lustre/doc/lfs-setdirstripe.1
lustre/include/lprocfs_status.h
lustre/include/lustre_lmv.h
lustre/include/uapi/linux/lustre/lustre_user.h
lustre/lmv/lmv_intent.c
lustre/lmv/lmv_internal.h
lustre/lmv/lmv_obd.c
lustre/lod/lod_internal.h
lustre/lod/lod_lov.c
lustre/lod/lod_object.c
lustre/lod/lod_pool.c
lustre/lod/lod_qos.c
lustre/lod/lproc_lod.c
lustre/mdt/mdt_reint.c
lustre/obdclass/lprocfs_status.c
lustre/obdclass/lu_tgt_descs.c
lustre/ptlrpc/wiretest.c
lustre/tests/sanity.sh
lustre/utils/lfs.c
lustre/utils/liblustreapi.c
lustre/utils/wirecheck.c
lustre/utils/wiretest.c

index fda8be3..9f52fa8 100644 (file)
@@ -44,13 +44,6 @@ Fowler-Noll-Vo (FNV-1a) hash algorithm.  This provides
 reasonably uniform, but not cryptographically strong,
 hashing of the filename. (default)
 .TP
-.B space
-This can only be set on plain directory default striping.
-Newly created sub-directories will be distributed on all
-MDTs by MDT space usage. Note that this is suggested to
-be set on lustre ROOT, so that all sub-directories under
-lustre ROOT are distributed among all MDTs.
-.TP
 .B all_char
 Sum of ASCII characters modulo number of MDTs. This
 provides weak hashing of the filename, and is suitable
index 98a2b1a..2b66f7a 100644 (file)
@@ -614,6 +614,7 @@ extern ssize_t
 lprocfs_pinger_recov_seq_write(struct file *file, const char __user *buffer,
                               size_t count, loff_t *off);
 
+int lu_str_to_s64(char *buffer, unsigned long count, __s64 *val, char defunit);
 extern int lprocfs_str_with_units_to_s64(const char __user *buffer,
                                         unsigned long count, __s64 *val,
                                         char defunit);
index 45c5366..d32b4ac 100644 (file)
@@ -54,12 +54,6 @@ struct lmv_stripe_md {
        struct lmv_oinfo lsm_md_oinfo[0];
 };
 
-static inline bool lmv_is_known_hash_type(__u32 type)
-{
-       return (type & LMV_HASH_TYPE_MASK) == LMV_HASH_TYPE_FNV_1A_64 ||
-              (type & LMV_HASH_TYPE_MASK) == LMV_HASH_TYPE_ALL_CHARS;
-}
-
 static inline bool lmv_dir_striped(const struct lmv_stripe_md *lsm)
 {
        return lsm && lsm->lsm_md_magic == LMV_MAGIC;
@@ -88,12 +82,6 @@ static inline bool lmv_dir_bad_hash(const struct lmv_stripe_md *lsm)
        return !lmv_is_known_hash_type(lsm->lsm_md_hash_type);
 }
 
-/* NB, this is checking directory default LMV */
-static inline bool lmv_dir_qos_mkdir(const struct lmv_stripe_md *lsm)
-{
-       return lsm && (lsm->lsm_md_hash_type & LMV_HASH_FLAG_SPACE);
-}
-
 static inline bool
 lsm_md_eq(const struct lmv_stripe_md *lsm1, const struct lmv_stripe_md *lsm2)
 {
index d7d8d48..5c5cd41 100644 (file)
@@ -712,6 +712,7 @@ struct fsxattr {
 #define LOV_PATTERN_DEFAULT    0xffffffff
 
 #define LOV_OFFSET_DEFAULT      ((__u16)-1)
+#define LMV_OFFSET_DEFAULT      ((__u32)-1)
 
 static inline bool lov_pattern_supported(__u32 pattern)
 {
@@ -1001,10 +1002,11 @@ enum lmv_hash_type {
  * might be interpreted differently with different flags. */
 #define LMV_HASH_TYPE_MASK 0x0000ffff
 
-/* once this is set on a plain directory default layout, newly created
- * subdirectories will be distributed on all MDTs by space usage.
- */
-#define LMV_HASH_FLAG_SPACE    0x08000000
+static inline bool lmv_is_known_hash_type(__u32 type)
+{
+       return (type & LMV_HASH_TYPE_MASK) == LMV_HASH_TYPE_FNV_1A_64 ||
+              (type & LMV_HASH_TYPE_MASK) == LMV_HASH_TYPE_ALL_CHARS;
+}
 
 /* The striped directory has ever lost its master LMV EA, then LFSCK
  * re-generated it. This flag is used to indicate such case. It is an
index 43c6b66..11a78b1 100644 (file)
@@ -305,22 +305,10 @@ static int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data,
                                /*
                                 * open(O_CREAT | O_EXCL) needs to check
                                 * existing name, which should be done on both
-                                * old and new layout, to avoid creating new
-                                * file under old layout, check old layout on
+                                * old and new layout, check old layout on
                                 * client side.
                                 */
-                               tgt = lmv_locate_tgt(lmv, op_data);
-                               if (IS_ERR(tgt))
-                                       RETURN(PTR_ERR(tgt));
-
-                               rc = md_getattr_name(tgt->ltd_exp, op_data,
-                                                    reqp);
-                               if (!rc) {
-                                       ptlrpc_req_finished(*reqp);
-                                       *reqp = NULL;
-                                       RETURN(-EEXIST);
-                               }
-
+                               rc = lmv_migrate_existence_check(lmv, op_data);
                                if (rc != -ENOENT)
                                        RETURN(rc);
 
index a58bebd..84a6d98 100644 (file)
@@ -49,7 +49,6 @@ int lmv_intent_lock(struct obd_export *exp, struct md_op_data *op_data,
 int lmv_blocking_ast(struct ldlm_lock *, struct ldlm_lock_desc *,
                     void *, int);
 int lmv_fld_lookup(struct lmv_obd *lmv, const struct lu_fid *fid, u32 *mds);
-int __lmv_fid_alloc(struct lmv_obd *lmv, struct lu_fid *fid, u32 mds);
 int lmv_fid_alloc(const struct lu_env *env, struct obd_export *exp,
                  struct lu_fid *fid, struct md_op_data *op_data);
 
@@ -218,8 +217,9 @@ static inline bool lmv_dir_retry_check_update(struct md_op_data *op_data)
 
 struct lmv_tgt_desc *lmv_locate_tgt(struct lmv_obd *lmv,
                                    struct md_op_data *op_data);
+int lmv_migrate_existence_check(struct lmv_obd *lmv,
+                               struct md_op_data *op_data);
 
 /* lproc_lmv.c */
 int lmv_tunables_init(struct obd_device *obd);
-
 #endif
index a01505b..8af14da 100644 (file)
@@ -1055,111 +1055,38 @@ hsm_req_err:
        RETURN(rc);
 }
 
-/**
- * This is _inode_ placement policy function (not name).
- */
-static u32 lmv_placement_policy(struct obd_device *obd,
-                               struct md_op_data *op_data)
+int lmv_fid_alloc(const struct lu_env *env, struct obd_export *exp,
+                 struct lu_fid *fid, struct md_op_data *op_data)
 {
+       struct obd_device *obd = class_exp2obd(exp);
        struct lmv_obd *lmv = &obd->u.lmv;
-       struct lmv_user_md *lum;
-       u32 mdt;
-
-       ENTRY;
-
-       if (lmv->lmv_mdt_count == 1)
-               RETURN(0);
-
-       lum = op_data->op_data;
-       /*
-        * Choose MDT by
-        * 1. See if the stripe offset is specified by lum.
-        * 2. If parent has default LMV, and its hash type is "space", choose
-        *    MDT with QoS. (see lmv_locate_tgt_qos()).
-        * 3. Then check if default LMV stripe offset is not -1.
-        * 4. Finally choose MDS by name hash if the parent
-        *    is striped directory. (see lmv_locate_tgt()).
-        *
-        * presently explicit MDT location is not supported
-        * for foreign dirs (as it can't be embedded into free
-        * format LMV, like with lum_stripe_offset), so we only
-        * rely on default stripe offset or then name hashing.
-        */
-       if (op_data->op_cli_flags & CLI_SET_MEA && lum != NULL &&
-           le32_to_cpu(lum->lum_magic != LMV_MAGIC_FOREIGN) &&
-           le32_to_cpu(lum->lum_stripe_offset) != (__u32)-1) {
-               mdt = le32_to_cpu(lum->lum_stripe_offset);
-       } else if (op_data->op_code == LUSTRE_OPC_MKDIR &&
-                  !lmv_dir_striped(op_data->op_mea1) &&
-                  lmv_dir_qos_mkdir(op_data->op_default_mea1)) {
-               mdt = op_data->op_mds;
-       } else if (op_data->op_code == LUSTRE_OPC_MKDIR &&
-                  op_data->op_default_mea1 &&
-                  op_data->op_default_mea1->lsm_md_master_mdt_index !=
-                       (__u32)-1) {
-               mdt = op_data->op_default_mea1->lsm_md_master_mdt_index;
-               op_data->op_mds = mdt;
-       } else {
-               mdt = op_data->op_mds;
-       }
-
-       RETURN(mdt);
-}
-
-int __lmv_fid_alloc(struct lmv_obd *lmv, struct lu_fid *fid, u32 mds)
-{
        struct lmv_tgt_desc *tgt;
        int rc;
 
        ENTRY;
 
-       tgt = lmv_tgt(lmv, mds);
+       LASSERT(op_data);
+       LASSERT(fid);
+
+       tgt = lmv_tgt(lmv, op_data->op_mds);
        if (!tgt)
                RETURN(-ENODEV);
 
+       if (!tgt->ltd_active || !tgt->ltd_exp)
+               RETURN(-ENODEV);
+
        /*
         * New seq alloc and FLD setup should be atomic. Otherwise we may find
         * on server that seq in new allocated fid is not yet known.
         */
        mutex_lock(&tgt->ltd_fid_mutex);
-
-       if (tgt->ltd_active == 0 || tgt->ltd_exp == NULL)
-               GOTO(out, rc = -ENODEV);
-
-       /*
-        * Asking underlying tgt layer to allocate new fid.
-        */
        rc = obd_fid_alloc(NULL, tgt->ltd_exp, fid, NULL);
+       mutex_unlock(&tgt->ltd_fid_mutex);
        if (rc > 0) {
                LASSERT(fid_is_sane(fid));
                rc = 0;
        }
 
-       EXIT;
-out:
-       mutex_unlock(&tgt->ltd_fid_mutex);
-       return rc;
-}
-
-int lmv_fid_alloc(const struct lu_env *env, struct obd_export *exp,
-                 struct lu_fid *fid, struct md_op_data *op_data)
-{
-       struct obd_device *obd = class_exp2obd(exp);
-       struct lmv_obd *lmv = &obd->u.lmv;
-       u32 mds;
-       int rc;
-
-       ENTRY;
-
-       LASSERT(op_data != NULL);
-       LASSERT(fid != NULL);
-
-       mds = lmv_placement_policy(obd, op_data);
-
-       rc = __lmv_fid_alloc(lmv, fid, mds);
-       if (rc)
-               CERROR("Can't alloc new fid, rc %d\n", rc);
-
        RETURN(rc);
 }
 
@@ -1659,8 +1586,7 @@ lmv_locate_tgt_by_name(struct lmv_obd *lmv, struct lmv_stripe_md *lsm,
  * which is set outside, and if dir is migrating, 'op_data->op_post_migrate'
  * indicates whether old or new layout is used to locate.
  *
- * For plain direcotry, normally it will locate MDT by FID, but if this
- * directory has default LMV, and its hash type is "space", locate MDT with QoS.
+ * For plain direcotry, it just locate the MDT of op_data->op_fid1.
  *
  * \param[in] lmv      LMV device
  * \param[in] op_data  client MD stack parameters, name, namelen
@@ -1683,7 +1609,7 @@ lmv_locate_tgt(struct lmv_obd *lmv, struct md_op_data *op_data)
         * index if the file under striped dir is being restored, see
         * ct_restore(). */
        if (op_data->op_bias & MDS_CREATE_VOLATILE &&
-           (int)op_data->op_mds != -1) {
+           op_data->op_mds != LMV_OFFSET_DEFAULT) {
                tgt = lmv_tgt(lmv, op_data->op_mds);
                if (!tgt)
                        return ERR_PTR(-ENODEV);
@@ -1711,30 +1637,7 @@ lmv_locate_tgt(struct lmv_obd *lmv, struct md_op_data *op_data)
                op_data->op_mds = oinfo->lmo_mds;
                tgt = lmv_tgt(lmv, oinfo->lmo_mds);
                if (!tgt)
-                       tgt = ERR_PTR(-ENODEV);
-       } else if (op_data->op_code == LUSTRE_OPC_MKDIR &&
-                  lmv_dir_qos_mkdir(op_data->op_default_mea1) &&
-                  !lmv_dir_striped(lsm)) {
-               tgt = lmv_locate_tgt_qos(lmv, &op_data->op_mds);
-               if (tgt == ERR_PTR(-EAGAIN))
-                       tgt = lmv_locate_tgt_rr(lmv, &op_data->op_mds);
-               /*
-                * only update statfs when mkdir under dir with "space" hash,
-                * this means the cached statfs may be stale, and current mkdir
-                * may not follow QoS accurately, but it's not serious, and it
-                * avoids periodic statfs when client doesn't mkdir under
-                * "space" hashed directories.
-                *
-                * TODO: after MDT support QoS object allocation, also update
-                * statfs for 'lfs mkdir -i -1 ...", currently it's done in user
-                * space.
-                */
-               if (!IS_ERR(tgt)) {
-                       struct obd_device *obd;
-
-                       obd = container_of(lmv, struct obd_device, u.lmv);
-                       lmv_statfs_check_update(obd, tgt);
-               }
+                       return ERR_PTR(-ENODEV);
        } else {
                tgt = lmv_locate_tgt_by_name(lmv, op_data->op_mea1,
                                op_data->op_name, op_data->op_namelen,
@@ -1787,6 +1690,78 @@ lmv_locate_tgt2(struct lmv_obd *lmv, struct md_op_data *op_data)
                                &op_data->op_mds, true);
 }
 
+int lmv_migrate_existence_check(struct lmv_obd *lmv, struct md_op_data *op_data)
+{
+       struct lu_tgt_desc *tgt;
+       struct ptlrpc_request *request;
+       int rc;
+
+       LASSERT(lmv_dir_migrating(op_data->op_mea1));
+
+       tgt = lmv_locate_tgt(lmv, op_data);
+       if (IS_ERR(tgt))
+               return PTR_ERR(tgt);
+
+       rc = md_getattr_name(tgt->ltd_exp, op_data, &request);
+       if (!rc) {
+               ptlrpc_req_finished(request);
+               return -EEXIST;
+       }
+
+       return rc;
+}
+
+/* mkdir by QoS in two cases:
+ * 1. 'lfs mkdir -i -1'
+ * 2. parent default LMV master_mdt_index is -1
+ *
+ * NB, mkdir by QoS only if parent is not striped, this is to avoid remote
+ * directories under striped directory.
+ */
+static inline bool lmv_op_qos_mkdir(const struct md_op_data *op_data)
+{
+       const struct lmv_stripe_md *lsm = op_data->op_default_mea1;
+       const struct lmv_user_md *lum = op_data->op_data;
+
+       if (op_data->op_code != LUSTRE_OPC_MKDIR)
+               return false;
+
+       if (lmv_dir_striped(op_data->op_mea1))
+               return false;
+
+       if (op_data->op_cli_flags & CLI_SET_MEA && lum &&
+           (le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC ||
+            le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC_SPECIFIC) &&
+           le32_to_cpu(lum->lum_stripe_offset) == LMV_OFFSET_DEFAULT)
+               return true;
+
+       if (lsm && lsm->lsm_md_master_mdt_index == LMV_OFFSET_DEFAULT)
+               return true;
+
+       return false;
+}
+
+/* 'lfs mkdir -i <specific_MDT>' */
+static inline bool lmv_op_user_specific_mkdir(const struct md_op_data *op_data)
+{
+       const struct lmv_user_md *lum = op_data->op_data;
+
+       return op_data->op_code == LUSTRE_OPC_MKDIR &&
+              op_data->op_cli_flags & CLI_SET_MEA && lum &&
+              (le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC ||
+               le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC_SPECIFIC) &&
+              le32_to_cpu(lum->lum_stripe_offset) != LMV_OFFSET_DEFAULT;
+}
+
+/* parent default LMV master_mdt_index is not -1. */
+static inline bool
+lmv_op_default_specific_mkdir(const struct md_op_data *op_data)
+{
+       return op_data->op_code == LUSTRE_OPC_MKDIR &&
+              op_data->op_default_mea1 &&
+              op_data->op_default_mea1->lsm_md_master_mdt_index !=
+                       LMV_OFFSET_DEFAULT;
+}
 int lmv_create(struct obd_export *exp, struct md_op_data *op_data,
                const void *data, size_t datalen, umode_t mode, uid_t uid,
                gid_t gid, cfs_cap_t cap_effective, __u64 rdev,
@@ -1808,20 +1783,9 @@ int lmv_create(struct obd_export *exp, struct md_op_data *op_data,
        if (lmv_dir_migrating(op_data->op_mea1)) {
                /*
                 * if parent is migrating, create() needs to lookup existing
-                * name, to avoid creating new file under old layout of
-                * migrating directory, check old layout here.
+                * name in both old and new layout, check old layout on client.
                 */
-               tgt = lmv_locate_tgt(lmv, op_data);
-               if (IS_ERR(tgt))
-                       RETURN(PTR_ERR(tgt));
-
-               rc = md_getattr_name(tgt->ltd_exp, op_data, request);
-               if (!rc) {
-                       ptlrpc_req_finished(*request);
-                       *request = NULL;
-                       RETURN(-EEXIST);
-               }
-
+               rc = lmv_migrate_existence_check(lmv, op_data);
                if (rc != -ENOENT)
                        RETURN(rc);
 
@@ -1832,26 +1796,44 @@ int lmv_create(struct obd_export *exp, struct md_op_data *op_data,
        if (IS_ERR(tgt))
                RETURN(PTR_ERR(tgt));
 
-       CDEBUG(D_INODE, "CREATE name '%.*s' on "DFID" -> mds #%x\n",
-               (int)op_data->op_namelen, op_data->op_name,
-               PFID(&op_data->op_fid1), op_data->op_mds);
+       if (lmv_op_qos_mkdir(op_data)) {
+               tgt = lmv_locate_tgt_qos(lmv, &op_data->op_mds);
+               if (tgt == ERR_PTR(-EAGAIN))
+                       tgt = lmv_locate_tgt_rr(lmv, &op_data->op_mds);
+               /*
+                * only update statfs after QoS mkdir, this means the cached
+                * statfs may be stale, and current mkdir may not follow QoS
+                * accurately, but it's not serious, and avoids periodic statfs
+                * when client doesn't mkdir by QoS.
+                */
+               if (!IS_ERR(tgt))
+                       lmv_statfs_check_update(obd, tgt);
+       } else if (lmv_op_user_specific_mkdir(op_data)) {
+               struct lmv_user_md *lum = op_data->op_data;
+
+               op_data->op_mds = le32_to_cpu(lum->lum_stripe_offset);
+               tgt = lmv_tgt(lmv, op_data->op_mds);
+               if (!tgt)
+                       RETURN(-ENODEV);
+       } else if (lmv_op_default_specific_mkdir(op_data)) {
+               op_data->op_mds =
+                       op_data->op_default_mea1->lsm_md_master_mdt_index;
+               tgt = lmv_tgt(lmv, op_data->op_mds);
+               if (!tgt)
+                       RETURN(-ENODEV);
+       }
+
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
 
        rc = lmv_fid_alloc(NULL, exp, &op_data->op_fid2, op_data);
        if (rc)
                RETURN(rc);
 
-       if (exp_connect_flags(exp) & OBD_CONNECT_DIR_STRIPE) {
-               /* Send the create request to the MDT where the object
-                * will be located */
-               tgt = lmv_fid2tgt(lmv, &op_data->op_fid2);
-               if (IS_ERR(tgt))
-                       RETURN(PTR_ERR(tgt));
-
-               op_data->op_mds = tgt->ltd_index;
-       }
-
-       CDEBUG(D_INODE, "CREATE obj "DFID" -> mds #%x\n",
-              PFID(&op_data->op_fid2), op_data->op_mds);
+       CDEBUG(D_INODE, "CREATE name '%.*s' "DFID" on "DFID" -> mds #%x\n",
+               (int)op_data->op_namelen, op_data->op_name,
+               PFID(&op_data->op_fid2), PFID(&op_data->op_fid1),
+               op_data->op_mds);
 
        op_data->op_flags |= MF_MDC_CANCEL_FID1;
        rc = md_create(tgt->ltd_exp, op_data, data, datalen, mode, uid, gid,
@@ -2107,10 +2089,20 @@ static int lmv_migrate(struct obd_export *exp, struct md_op_data *op_data,
        if (IS_ERR(child_tgt))
                RETURN(PTR_ERR(child_tgt));
 
-       if (!S_ISDIR(op_data->op_mode) && tp_tgt)
-               rc = __lmv_fid_alloc(lmv, &target_fid, tp_tgt->ltd_index);
-       else
-               rc = lmv_fid_alloc(NULL, exp, &target_fid, op_data);
+       /* for directory, migrate to MDT specified by lum_stripe_offset;
+        * otherwise migrate to the target stripe of parent, but parent
+        * directory may have finished migration (normally current file too),
+        * allocate FID on MDT lum_stripe_offset, and server will check
+        * whether file was migrated already.
+        */
+       if (S_ISDIR(op_data->op_mode) || !tp_tgt) {
+               struct lmv_user_md *lum = op_data->op_data;
+
+               op_data->op_mds = le32_to_cpu(lum->lum_stripe_offset);
+       } else  {
+               op_data->op_mds = tp_tgt->ltd_index;
+       }
+       rc = lmv_fid_alloc(NULL, exp, &target_fid, op_data);
        if (rc)
                RETURN(rc);
 
@@ -3127,7 +3119,7 @@ static int lmv_unpack_md_v1(struct obd_export *exp, struct lmv_stripe_md *lsm,
                 * set default value -1, so lmv_locate_tgt() knows this stripe
                 * target is not initialized.
                 */
-               lsm->lsm_md_oinfo[i].lmo_mds = (u32)-1;
+               lsm->lsm_md_oinfo[i].lmo_mds = LMV_OFFSET_DEFAULT;
                if (!fid_is_sane(&lsm->lsm_md_oinfo[i].lmo_fid))
                        continue;
 
index bf517f9..65ac3b9 100644 (file)
@@ -578,6 +578,7 @@ void lod_fix_desc_qos_maxage(__u32 *val);
 void lod_fix_desc_pattern(__u32 *val);
 void lod_fix_desc_stripe_count(__u32 *val);
 void lod_fix_desc_stripe_size(__u64 *val);
+void lod_fix_lmv_desc_pattern(__u32 *val);
 int lod_pools_init(struct lod_device *m, struct lustre_cfg *cfg);
 int lod_pools_fini(struct lod_device *m);
 int lod_parse_striping(const struct lu_env *env, struct lod_object *mo,
@@ -598,14 +599,14 @@ int lod_alloc_comp_entries(struct lod_object *lo, int mirror_cnt, int comp_cnt);
 int lod_fill_mirrors(struct lod_object *lo);
 
 /* lod_pool.c */
-int lod_ost_pool_add(struct lu_tgt_pool *op, __u32 idx, unsigned int min_count);
-int lod_ost_pool_remove(struct lu_tgt_pool *op, __u32 idx);
-int lod_ost_pool_extend(struct lu_tgt_pool *op, unsigned int min_count);
+int lod_tgt_pool_init(struct lu_tgt_pool *op, unsigned int count);
+int lod_tgt_pool_free(struct lu_tgt_pool *op);
+int lod_tgt_pool_add(struct lu_tgt_pool *op, __u32 idx, unsigned int min_count);
+int lod_tgt_pool_remove(struct lu_tgt_pool *op, __u32 idx);
+int lod_tgt_pool_extend(struct lu_tgt_pool *op, unsigned int min_count);
 struct pool_desc *lod_find_pool(struct lod_device *lod, char *poolname);
 void lod_pool_putref(struct pool_desc *pool);
-int lod_ost_pool_free(struct lu_tgt_pool *op);
 int lod_pool_del(struct obd_device *obd, char *poolname);
-int lod_ost_pool_init(struct lu_tgt_pool *op, unsigned int count);
 extern struct cfs_hash_ops pool_hash_operations;
 int lod_check_index_in_pool(__u32 idx, struct pool_desc *pool);
 int lod_pool_new(struct obd_device *obd, char *poolname);
@@ -637,6 +638,10 @@ struct lod_obj_stripe_cb_data {
 };
 
 /* lod_qos.c */
+int lod_mdt_alloc_qos(const struct lu_env *env, struct lod_object *lo,
+                     struct dt_object **stripes);
+int lod_mdt_alloc_rr(const struct lu_env *env, struct lod_object *lo,
+                    struct dt_object **stripe);
 int lod_prepare_create(const struct lu_env *env, struct lod_object *lo,
                       struct lu_attr *attr, const struct lu_buf *buf,
                       struct thandle *th);
@@ -652,7 +657,8 @@ __u16 lod_comp_entry_stripe_count(struct lod_object *lo,
                                  bool is_dir);
 __u16 lod_get_stripe_count(struct lod_device *lod, struct lod_object *lo,
                           __u16 stripe_count, bool overstriping);
-void lod_qos_statfs_update(const struct lu_env *env, struct lod_device *lod);
+void lod_qos_statfs_update(const struct lu_env *env, struct lod_device *lod,
+                          struct lu_tgt_descs *ltd);
 
 /* lproc_lod.c */
 int lod_procfs_init(struct lod_device *lod);
index 6590d3c..e449db6 100644 (file)
@@ -89,10 +89,8 @@ void lod_putref(struct lod_device *lod, struct lod_tgt_descs *ltd)
                                continue;
 
                        list_add(&tgt_desc->ltd_kill, &kill);
-                       /*FIXME: only support ost pool for now */
-                       if (ltd == &lod->lod_ost_descs)
-                               lod_ost_pool_remove(&ltd->ltd_tgt_pool,
-                                                   tgt_desc->ltd_index);
+                       lod_tgt_pool_remove(&ltd->ltd_tgt_pool,
+                                           tgt_desc->ltd_index);
                        ltd_del_tgt(ltd, tgt_desc);
                        ltd->ltd_death_row--;
                }
@@ -256,15 +254,12 @@ int lod_add_device(const struct lu_env *env, struct lod_device *lod,
        if (rc)
                GOTO(out_del_tgt, rc);
 
-       if (for_ost) {
-               /* pool is not supported for MDS stack yet */
-               rc = lod_ost_pool_add(&ltd->ltd_tgt_pool, index,
-                                     ltd->ltd_tgts_size);
-               if (rc) {
-                       CERROR("%s: can't set up pool, failed with %d\n",
-                              obd->obd_name, rc);
-                       GOTO(out_del_tgt, rc);
-               }
+       rc = lod_tgt_pool_add(&ltd->ltd_tgt_pool, index,
+                             ltd->ltd_lov_desc.ld_tgt_count);
+       if (rc) {
+               CERROR("%s: can't set up pool, failed with %d\n",
+                      obd->obd_name, rc);
+               GOTO(out_del_tgt, rc);
        }
 
        mutex_unlock(&ltd->ltd_mutex);
@@ -301,7 +296,7 @@ out_ltd:
                thread = LTD_TGT(ltd, index)->ltd_recovery_thread;
                OBD_FREE_PTR(thread);
        }
-       lod_ost_pool_remove(&ltd->ltd_tgt_pool, index);
+       lod_tgt_pool_remove(&ltd->ltd_tgt_pool, index);
 out_del_tgt:
        ltd_del_tgt(ltd, tgt_desc);
 out_mutex:
@@ -2006,6 +2001,14 @@ void lod_fix_desc_pattern(__u32 *val)
        }
 }
 
+void lod_fix_lmv_desc_pattern(__u32 *val)
+{
+       if ((*val) && !lmv_is_known_hash_type(*val)) {
+               LCONSOLE_WARN("lod: Unknown md stripe pattern: %#x\n", *val);
+               *val = 0;
+       }
+}
+
 void lod_fix_desc_qos_maxage(__u32 *val)
 {
        /* fix qos_maxage */
@@ -2026,6 +2029,14 @@ void lod_fix_desc(struct lov_desc *desc)
        lod_fix_desc_qos_maxage(&desc->ld_qos_maxage);
 }
 
+static void lod_fix_lmv_desc(struct lmv_desc *desc)
+{
+       desc->ld_active_tgt_count = 0;
+       lod_fix_desc_stripe_count(&desc->ld_default_stripe_count);
+       lod_fix_lmv_desc_pattern(&desc->ld_pattern);
+       lod_fix_desc_qos_maxage(&desc->ld_qos_maxage);
+}
+
 /**
  * Initialize the structures used to store pools and default striping.
  *
@@ -2076,6 +2087,9 @@ int lod_pools_init(struct lod_device *lod, struct lustre_cfg *lcfg)
        desc->ld_active_tgt_count = 0;
        lod->lod_ost_descs.ltd_lov_desc = *desc;
 
+       /* NB: config doesn't contain lmv_desc, alter it via sysfs. */
+       lod_fix_lmv_desc(&lod->lod_mdt_descs.ltd_lmv_desc);
+
        lod->lod_sp_me = LUSTRE_SP_CLI;
 
        /* Set up OST pool environment */
@@ -2091,17 +2105,30 @@ int lod_pools_init(struct lod_device *lod, struct lustre_cfg *lcfg)
 
        INIT_LIST_HEAD(&lod->lod_pool_list);
        lod->lod_pool_count = 0;
-       rc = lod_ost_pool_init(&lod->lod_ost_descs.ltd_tgt_pool, 0);
+       rc = lod_tgt_pool_init(&lod->lod_mdt_descs.ltd_tgt_pool, 0);
        if (rc)
                GOTO(out_hash, rc);
-       rc = lod_ost_pool_init(&lod->lod_ost_descs.ltd_qos.lq_rr.lqr_pool, 0);
+
+       rc = lod_tgt_pool_init(&lod->lod_mdt_descs.ltd_qos.lq_rr.lqr_pool, 0);
+       if (rc)
+               GOTO(out_mdt_pool, rc);
+
+       rc = lod_tgt_pool_init(&lod->lod_ost_descs.ltd_tgt_pool, 0);
+       if (rc)
+               GOTO(out_mdt_rr_pool, rc);
+
+       rc = lod_tgt_pool_init(&lod->lod_ost_descs.ltd_qos.lq_rr.lqr_pool, 0);
        if (rc)
-               GOTO(out_pool_info, rc);
+               GOTO(out_ost_pool, rc);
 
        RETURN(0);
 
-out_pool_info:
-       lod_ost_pool_free(&lod->lod_ost_descs.ltd_tgt_pool);
+out_ost_pool:
+       lod_tgt_pool_free(&lod->lod_ost_descs.ltd_tgt_pool);
+out_mdt_rr_pool:
+       lod_tgt_pool_free(&lod->lod_mdt_descs.ltd_qos.lq_rr.lqr_pool);
+out_mdt_pool:
+       lod_tgt_pool_free(&lod->lod_mdt_descs.ltd_tgt_pool);
 out_hash:
        cfs_hash_putref(lod->lod_pools_hash_body);
 
@@ -2131,8 +2158,10 @@ int lod_pools_fini(struct lod_device *lod)
        }
 
        cfs_hash_putref(lod->lod_pools_hash_body);
-       lod_ost_pool_free(&(lod->lod_ost_descs.ltd_qos.lq_rr.lqr_pool));
-       lod_ost_pool_free(&lod->lod_ost_descs.ltd_tgt_pool);
+       lod_tgt_pool_free(&lod->lod_ost_descs.ltd_qos.lq_rr.lqr_pool);
+       lod_tgt_pool_free(&lod->lod_ost_descs.ltd_tgt_pool);
+       lod_tgt_pool_free(&lod->lod_mdt_descs.ltd_qos.lq_rr.lqr_pool);
+       lod_tgt_pool_free(&lod->lod_mdt_descs.ltd_tgt_pool);
 
        RETURN(0);
 }
index c7c2a3a..f25a490 100644 (file)
@@ -1945,74 +1945,73 @@ out:
        RETURN(rc);
 }
 
-static int lod_prep_md_striped_create(const struct lu_env *env,
-                                     struct dt_object *dt,
-                                     struct lu_attr *attr,
-                                     const struct lmv_user_md_v1 *lum,
-                                     struct dt_object_format *dof,
-                                     struct thandle *th)
+/**
+ * Allocate a striping on a predefined set of MDTs.
+ *
+ * Allocates new striping using the MDT index range provided by the data from
+ * the lum_objects contained in the lmv_user_md passed to this method if
+ * \a is_specific is true; or allocates new layout starting from MDT index in
+ * lo->ldo_dir_stripe_offset. The exact order of MDTs is not important and
+ * varies depending on MDT status. The number of stripes needed and stripe
+ * offset are taken from the object. If that number cannot be met, then the
+ * function returns an error and then it's the caller's responsibility to
+ * release the stripes allocated. All the internal structures are protected,
+ * but no concurrent allocation is allowed on the same objects.
+ *
+ * \param[in] env              execution environment for this thread
+ * \param[in] lo               LOD object
+ * \param[out] stripes         striping created
+ * \param[out] mdt_indices     MDT indices of striping created
+ * \param[in] is_specific      true if the MDTs are provided by lum; false if
+ *                             only the starting MDT index is provided
+ *
+ * \retval positive    stripes allocated, including the first stripe allocated
+ *                     outside
+ * \retval negative    errno on failure
+ */
+static int lod_mdt_alloc_specific(const struct lu_env *env,
+                                 struct lod_object *lo,
+                                 struct dt_object **stripes,
+                                 __u32 *mdt_indices, bool is_specific)
 {
        struct lod_thread_info  *info = lod_env_info(env);
-       struct lod_device       *lod = lu2lod_dev(dt->do_lu.lo_dev);
-       struct lod_tgt_descs    *ltd = &lod->lod_mdt_descs;
-       struct lod_object       *lo = lod_dt_obj(dt);
-       struct dt_object        **stripe;
-       __u32                   stripe_count;
-       int                     *idx_array;
-       __u32                   master_index;
-       int                     rc = 0;
-       __u32                   i;
-       __u32                   j;
-       bool                    is_specific = false;
-       ENTRY;
-
-       /* The lum has been verifed in lod_verify_md_striping */
-       LASSERT(le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC ||
-               le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC_SPECIFIC);
-
-       stripe_count = lo->ldo_dir_stripe_count;
-
-       OBD_ALLOC(idx_array, sizeof(idx_array[0]) * stripe_count);
-       if (idx_array == NULL)
-               RETURN(-ENOMEM);
-
-       OBD_ALLOC(stripe, sizeof(stripe[0]) * stripe_count);
-       if (stripe == NULL)
-               GOTO(out_free, rc = -ENOMEM);
+       struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
+       struct lu_tgt_descs *ltd = &lod->lod_mdt_descs;
+       struct lu_tgt_desc *tgt = NULL;
+       struct lu_object_conf conf = { .loc_flags = LOC_F_NEW };
+       struct dt_device *tgt_dt = NULL;
+       struct lu_fid fid = { 0 };
+       struct dt_object *dto;
+       u32 master_index;
+       u32 stripe_count = lo->ldo_dir_stripe_count;
+       int stripe_idx = 1;
+       int j;
+       int idx;
+       int rc;
 
-       /* Start index must be the master MDT */
        master_index = lu_site2seq(lod2lu_dev(lod)->ld_site)->ss_node_id;
-       idx_array[0] = master_index;
-       if (le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC_SPECIFIC) {
-               is_specific = true;
-               for (i = 1; i < stripe_count; i++)
-                       idx_array[i] = le32_to_cpu(lum->lum_objects[i].lum_mds);
-       }
-
-       for (i = 0; i < stripe_count; i++) {
-               struct lod_tgt_desc     *tgt = NULL;
-               struct dt_object        *dto;
-               struct lu_fid           fid = { 0 };
-               int                     idx;
-               struct lu_object_conf   conf = { 0 };
-               struct dt_device        *tgt_dt = NULL;
+       if (stripe_count > 1)
+               /* Set the start index for the 2nd stripe allocation */
+               mdt_indices[1] = (mdt_indices[0] + 1) %
+                                       (lod->lod_remote_mdt_count + 1);
 
+       for (; stripe_idx < stripe_count; stripe_idx++) {
                /* Try to find next avaible target */
-               idx = idx_array[i];
+               idx = mdt_indices[stripe_idx];
                for (j = 0; j < lod->lod_remote_mdt_count;
                     j++, idx = (idx + 1) % (lod->lod_remote_mdt_count + 1)) {
                        bool already_allocated = false;
                        __u32 k;
 
                        CDEBUG(D_INFO, "try idx %d, mdt cnt %u, allocated %u\n",
-                              idx, lod->lod_remote_mdt_count + 1, i);
+                              idx, lod->lod_remote_mdt_count + 1, stripe_idx);
 
                        if (likely(!is_specific &&
                                   !OBD_FAIL_CHECK(OBD_FAIL_LARGE_STRIPE))) {
                                /* check whether the idx already exists
                                 * in current allocated array */
-                               for (k = 0; k < i; k++) {
-                                       if (idx_array[k] == idx) {
+                               for (k = 0; k < stripe_idx; k++) {
+                                       if (mdt_indices[k] == idx) {
                                                already_allocated = true;
                                                break;
                                        }
@@ -2033,29 +2032,25 @@ static int lod_prep_md_striped_create(const struct lu_env *env,
                                rc = obd_fid_alloc(env, lod->lod_child_exp,
                                                   &fid, NULL);
                                if (rc < 0)
-                                       GOTO(out_put, rc);
+                                       continue;
                                tgt_dt = lod->lod_child;
                                break;
                        }
 
                        /* check the status of the OSP */
                        tgt = LTD_TGT(ltd, idx);
-                       if (tgt == NULL)
+                       if (!tgt)
                                continue;
 
                        tgt_dt = tgt->ltd_tgt;
                        rc = dt_statfs(env, tgt_dt, &info->lti_osfs);
-                       if (rc) {
+                       if (rc)
                                /* this OSP doesn't feel well */
-                               rc = 0;
                                continue;
-                       }
 
                        rc = obd_fid_alloc(env, tgt->ltd_exp, &fid, NULL);
-                       if (rc < 0) {
-                               rc = 0;
+                       if (rc < 0)
                                continue;
-                       }
 
                        break;
                }
@@ -2063,15 +2058,16 @@ static int lod_prep_md_striped_create(const struct lu_env *env,
                /* Can not allocate more stripes */
                if (j == lod->lod_remote_mdt_count) {
                        CDEBUG(D_INFO, "%s: require stripes %u only get %d\n",
-                              lod2obd(lod)->obd_name, stripe_count, i);
+                              lod2obd(lod)->obd_name, stripe_count,
+                              stripe_idx);
                        break;
                }
 
                CDEBUG(D_INFO, "Get idx %d, for stripe %d "DFID"\n",
-                      idx, i, PFID(&fid));
-               idx_array[i] = idx;
+                      idx, stripe_idx, PFID(&fid));
+               mdt_indices[stripe_idx] = idx;
                /* Set the start index for next stripe allocation */
-               if (!is_specific && i < stripe_count - 1) {
+               if (!is_specific && stripe_idx < stripe_count - 1) {
                        /*
                         * for large dir test, put all other slaves on one
                         * remote MDT, otherwise we may save too many local
@@ -2079,7 +2075,7 @@ static int lod_prep_md_striped_create(const struct lu_env *env,
                         */
                        if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_LARGE_STRIPE)))
                                idx = master_index;
-                       idx_array[i + 1] = (idx + 1) %
+                       mdt_indices[stripe_idx + 1] = (idx + 1) %
                                           (lod->lod_remote_mdt_count + 1);
                }
                /* tgt_dt and fid must be ready after search avaible OSP
@@ -2088,47 +2084,124 @@ static int lod_prep_md_striped_create(const struct lu_env *env,
                LASSERT(fid_is_sane(&fid));
 
                /* fail a remote stripe FID allocation */
-               if (i && OBD_FAIL_CHECK(OBD_FAIL_MDS_STRIPE_FID))
+               if (stripe_idx && OBD_FAIL_CHECK(OBD_FAIL_MDS_STRIPE_FID))
                        continue;
 
-               conf.loc_flags = LOC_F_NEW;
                dto = dt_locate_at(env, tgt_dt, &fid,
-                                  dt->do_lu.lo_dev->ld_site->ls_top_dev,
-                                  &conf);
-               if (IS_ERR(dto))
-                       GOTO(out_put, rc = PTR_ERR(dto));
-               stripe[i] = dto;
+                                 lo->ldo_obj.do_lu.lo_dev->ld_site->ls_top_dev,
+                                 &conf);
+               if (IS_ERR(dto)) {
+                       rc = PTR_ERR(dto);
+                       goto error;
+               }
+
+               stripes[stripe_idx] = dto;
        }
 
+       return stripe_idx;
+
+error:
+       for (j = 1; j < stripe_idx; j++) {
+               LASSERT(stripes[j] != NULL);
+               dt_object_put(env, stripes[j]);
+               stripes[j] = NULL;
+       }
+       return rc;
+}
+
+static int lod_prep_md_striped_create(const struct lu_env *env,
+                                     struct dt_object *dt,
+                                     struct lu_attr *attr,
+                                     const struct lmv_user_md_v1 *lum,
+                                     struct dt_object_format *dof,
+                                     struct thandle *th)
+{
+       struct lod_device *lod = lu2lod_dev(dt->do_lu.lo_dev);
+       struct lod_object *lo = lod_dt_obj(dt);
+       struct dt_object **stripes;
+       struct lu_object_conf conf = { .loc_flags = LOC_F_NEW };
+       struct lu_fid fid = { 0 };
+       __u32 stripe_count;
+       int i;
+       int rc = 0;
+
+       ENTRY;
+
+       /* The lum has been verified in lod_verify_md_striping */
+       LASSERT(le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC ||
+               le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC_SPECIFIC);
+
+       stripe_count = lo->ldo_dir_stripe_count;
+
+       OBD_ALLOC(stripes, sizeof(stripes[0]) * stripe_count);
+       if (!stripes)
+               RETURN(-ENOMEM);
+
+       /* Allocate the first stripe locally */
+       rc = obd_fid_alloc(env, lod->lod_child_exp, &fid, NULL);
+       if (rc < 0)
+               GOTO(out, rc);
+
+       stripes[0] = dt_locate_at(env, lod->lod_child, &fid,
+                                 dt->do_lu.lo_dev->ld_site->ls_top_dev, &conf);
+       if (IS_ERR(stripes[0]))
+               GOTO(out, rc = PTR_ERR(stripes[0]));
+
+       if (lo->ldo_dir_stripe_offset == LMV_OFFSET_DEFAULT) {
+               lod_qos_statfs_update(env, lod, &lod->lod_mdt_descs);
+               rc = lod_mdt_alloc_qos(env, lo, stripes);
+               if (rc == -EAGAIN)
+                       rc = lod_mdt_alloc_rr(env, lo, stripes);
+       } else {
+               int *idx_array;
+               bool is_specific = false;
+
+               OBD_ALLOC(idx_array, sizeof(idx_array[0]) * stripe_count);
+               if (!idx_array)
+                       GOTO(out, rc = -ENOMEM);
+
+               if (le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC_SPECIFIC) {
+                       is_specific = true;
+                       for (i = 0; i < stripe_count; i++)
+                               idx_array[i] =
+                                      le32_to_cpu(lum->lum_objects[i].lum_mds);
+               }
+
+               /* stripe 0 is local */
+               idx_array[0] =
+                       lu_site2seq(lod2lu_dev(lod)->ld_site)->ss_node_id;
+               rc = lod_mdt_alloc_specific(env, lo, stripes, idx_array,
+                                           is_specific);
+               OBD_FREE(idx_array, sizeof(idx_array[0]) * stripe_count);
+       }
+
+       if (rc < 0)
+               GOTO(out, rc);
+
+       LASSERT(rc > 0);
+
        lo->ldo_dir_striped = 1;
-       lo->ldo_stripe = stripe;
-       lo->ldo_dir_stripe_count = i;
+       lo->ldo_stripe = stripes;
+       lo->ldo_dir_stripe_count = rc;
        lo->ldo_dir_stripes_allocated = stripe_count;
        smp_mb();
        lo->ldo_dir_stripe_loaded = 1;
 
-       if (lo->ldo_dir_stripe_count == 0)
-               GOTO(out_put, rc = -ENOSPC);
-
        rc = lod_dir_declare_create_stripes(env, dt, attr, dof, th);
-       if (rc != 0)
-               GOTO(out_put, rc);
+       if (rc < 0)
+               lod_striping_free(env, lo);
 
-out_put:
-       if (rc < 0) {
-               for (i = 0; i < stripe_count; i++)
-                       if (stripe[i] != NULL)
-                               dt_object_put(env, stripe[i]);
-               OBD_FREE(stripe, sizeof(stripe[0]) * stripe_count);
-               lo->ldo_dir_stripe_count = 0;
-               lo->ldo_dir_stripes_allocated = 0;
-               lo->ldo_stripe = NULL;
-       }
+       RETURN(rc);
 
-out_free:
-       OBD_FREE(idx_array, sizeof(idx_array[0]) * stripe_count);
+out:
+       LASSERT(rc < 0);
+       if (!IS_ERR_OR_NULL(stripes[0]))
+               dt_object_put(env, stripes[0]);
+       for (i = 1; i < stripe_count; i++)
+               LASSERT(!stripes[i]);
+       OBD_FREE(stripes, sizeof(stripes[0]) * stripe_count);
 
-       RETURN(rc);
+       return rc;
 }
 
 /**
@@ -3770,8 +3843,7 @@ static int lod_xattr_set_default_lmv_on_dir(const struct lu_env *env,
 
        if (LMVEA_DELETE_VALUES((le32_to_cpu(lum->lum_stripe_count)),
                                 le32_to_cpu(lum->lum_stripe_offset)) &&
-           le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC &&
-           !(le32_to_cpu(lum->lum_hash_type) & LMV_HASH_FLAG_SPACE)) {
+           le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC) {
                rc = lod_xattr_del_internal(env, dt, name, th);
                if (rc == -ENODATA)
                        rc = 0;
@@ -5136,8 +5208,7 @@ static void lod_striping_from_default(struct lod_object *lo,
                        lo->ldo_dir_stripe_offset =
                                lds->lds_dir_def_stripe_offset;
                if (lo->ldo_dir_hash_type == 0)
-                       lo->ldo_dir_hash_type = lds->lds_dir_def_hash_type &
-                                               ~LMV_HASH_FLAG_SPACE;
+                       lo->ldo_dir_hash_type = lds->lds_dir_def_hash_type;
 
                CDEBUG(D_LAYOUT, "striping from default dir: count:%hu, "
                       "offset:%u, hash_type:%u\n",
@@ -5593,7 +5664,7 @@ static inline int dt_object_qos_mkdir(const struct lu_env *env,
                return -EINVAL;
 
        lmu = info->lti_ea_store;
-       return !!(le32_to_cpu(lmu->lum_hash_type) & LMV_HASH_FLAG_SPACE);
+       return le32_to_cpu(lmu->lum_stripe_offset) == LMV_OFFSET_DEFAULT;
 }
 
 /**
@@ -5659,19 +5730,22 @@ static int lod_declare_create(const struct lu_env *env, struct dt_object *dt,
                        if (OBD_FAIL_CHECK(OBD_FAIL_MDS_STALE_DIR_LAYOUT))
                                GOTO(out, rc = -EREMOTE);
 
-                       if (lo->ldo_dir_stripe_offset == -1) {
+                       if (lo->ldo_dir_stripe_offset == LMV_OFFSET_DEFAULT) {
+                               struct lod_default_striping *lds;
+
+                               lds = lo->ldo_def_striping;
                                /*
-                                * child and parent should be in the same MDT,
-                                * but if parent has plain layout, it's allowed.
+                                * child and parent should be on the same MDT,
+                                * but if parent has default LMV, and the start
+                                * MDT offset is -1, it's allowed. This check
+                                * is not necessary after 2.12.22 because client
+                                * follows this already, but old client may not.
                                 */
                                if (hint->dah_parent &&
-                                   dt_object_remote(hint->dah_parent)) {
-                                       rc = dt_object_qos_mkdir(env,
-                                                      lo->ldo_obj.do_lu.lo_dev,
-                                                      hint->dah_parent);
-                                       if (rc <= 0)
-                                               GOTO(out, rc ? rc : -EREMOTE);
-                               }
+                                   dt_object_remote(hint->dah_parent) && lds &&
+                                   lds->lds_dir_def_stripe_offset !=
+                                   LMV_OFFSET_DEFAULT)
+                                       GOTO(out, rc = -EREMOTE);
                        } else if (lo->ldo_dir_stripe_offset !=
                                   ss->ss_node_id) {
                                struct lod_device *lod;
@@ -7178,7 +7252,7 @@ static int lod_primary_pick(const struct lu_env *env, struct lod_object *lo,
         * This algo can be revised later after knowing the topology of
         * cluster.
         */
-       lod_qos_statfs_update(env, lod);
+       lod_qos_statfs_update(env, lod, &lod->lod_ost_descs);
        for (i = 0; i < lo->ldo_mirror_count; i++) {
                bool ost_avail = true;
                int index = (i + seq) % lo->ldo_mirror_count;
index eb23de4..51cff07 100644 (file)
@@ -99,8 +99,8 @@ void lod_pool_putref(struct pool_desc *pool)
                LASSERT(hlist_unhashed(&pool->pool_hash));
                LASSERT(list_empty(&pool->pool_list));
                LASSERT(pool->pool_proc_entry == NULL);
-               lod_ost_pool_free(&(pool->pool_rr.lqr_pool));
-               lod_ost_pool_free(&(pool->pool_obds));
+               lod_tgt_pool_free(&(pool->pool_rr.lqr_pool));
+               lod_tgt_pool_free(&(pool->pool_obds));
                OBD_FREE_PTR(pool);
                EXIT;
        }
@@ -464,7 +464,7 @@ void lod_dump_pool(int level, struct pool_desc *pool)
  * \retval             negative error number on failure
  */
 #define POOL_INIT_COUNT 2
-int lod_ost_pool_init(struct lu_tgt_pool *op, unsigned int count)
+int lod_tgt_pool_init(struct lu_tgt_pool *op, unsigned int count)
 {
        ENTRY;
 
@@ -496,7 +496,7 @@ int lod_ost_pool_init(struct lu_tgt_pool *op, unsigned int count)
  * \retval             0 on success
  * \retval             negative error number on failure.
  */
-int lod_ost_pool_extend(struct lu_tgt_pool *op, unsigned int min_count)
+int lod_tgt_pool_extend(struct lu_tgt_pool *op, unsigned int min_count)
 {
        __u32 *new;
        __u32 new_size;
@@ -534,7 +534,7 @@ int lod_ost_pool_extend(struct lu_tgt_pool *op, unsigned int min_count)
  * \retval             0 if target could be added to the pool
  * \retval             negative error if target \a idx was not added
  */
-int lod_ost_pool_add(struct lu_tgt_pool *op, __u32 idx, unsigned int min_count)
+int lod_tgt_pool_add(struct lu_tgt_pool *op, __u32 idx, unsigned int min_count)
 {
        unsigned int i;
        int rc = 0;
@@ -542,7 +542,7 @@ int lod_ost_pool_add(struct lu_tgt_pool *op, __u32 idx, unsigned int min_count)
 
        down_write(&op->op_rw_sem);
 
-       rc = lod_ost_pool_extend(op, min_count);
+       rc = lod_tgt_pool_extend(op, min_count);
        if (rc)
                GOTO(out, rc);
 
@@ -574,7 +574,7 @@ out:
  * \retval             0 on success
  * \retval             negative error number on failure
  */
-int lod_ost_pool_remove(struct lu_tgt_pool *op, __u32 idx)
+int lod_tgt_pool_remove(struct lu_tgt_pool *op, __u32 idx)
 {
        unsigned int i;
        ENTRY;
@@ -608,7 +608,7 @@ int lod_ost_pool_remove(struct lu_tgt_pool *op, __u32 idx)
  *
  * \retval             0 on success or if pool was already freed
  */
-int lod_ost_pool_free(struct lu_tgt_pool *op)
+int lod_tgt_pool_free(struct lu_tgt_pool *op)
 {
        ENTRY;
 
@@ -657,13 +657,13 @@ int lod_pool_new(struct obd_device *obd, char *poolname)
        strlcpy(new_pool->pool_name, poolname, sizeof(new_pool->pool_name));
        new_pool->pool_lobd = obd;
        atomic_set(&new_pool->pool_refcount, 1);
-       rc = lod_ost_pool_init(&new_pool->pool_obds, 0);
+       rc = lod_tgt_pool_init(&new_pool->pool_obds, 0);
        if (rc)
                GOTO(out_err, rc);
 
        lu_qos_rr_init(&new_pool->pool_rr);
 
-       rc = lod_ost_pool_init(&new_pool->pool_rr.lqr_pool, 0);
+       rc = lod_tgt_pool_init(&new_pool->pool_rr.lqr_pool, 0);
        if (rc)
                GOTO(out_free_pool_obds, rc);
 
@@ -708,9 +708,9 @@ out_err:
 
        lprocfs_remove(&new_pool->pool_proc_entry);
 
-       lod_ost_pool_free(&new_pool->pool_rr.lqr_pool);
+       lod_tgt_pool_free(&new_pool->pool_rr.lqr_pool);
 out_free_pool_obds:
-       lod_ost_pool_free(&new_pool->pool_obds);
+       lod_tgt_pool_free(&new_pool->pool_obds);
        OBD_FREE_PTR(new_pool);
        return rc;
 }
@@ -791,8 +791,8 @@ int lod_pool_add(struct obd_device *obd, char *poolname, char *ostname)
        if (rc)
                GOTO(out, rc);
 
-       rc = lod_ost_pool_add(&pool->pool_obds, tgt->ltd_index,
-                             lod->lod_ost_descs.ltd_tgts_size);
+       rc = lod_tgt_pool_add(&pool->pool_obds, tgt->ltd_index,
+                             lod->lod_ost_count);
        if (rc)
                GOTO(out, rc);
 
@@ -849,8 +849,7 @@ int lod_pool_remove(struct obd_device *obd, char *poolname, char *ostname)
        if (rc)
                GOTO(out, rc);
 
-       lod_ost_pool_remove(&pool->pool_obds, ost->ltd_index);
-
+       lod_tgt_pool_remove(&pool->pool_obds, ost->ltd_index);
        pool->pool_rr.lqr_dirty = 1;
 
        CDEBUG(D_CONFIG, "%s removed from "LOV_POOLNAMEF"\n", ostname,
index 1892dee..149b06f 100644 (file)
 #define TGT_BAVAIL(i) (OST_TGT(lod,i)->ltd_statfs.os_bavail * \
                       OST_TGT(lod,i)->ltd_statfs.os_bsize)
 
+static inline int lod_statfs_check(struct lu_tgt_descs *ltd,
+                                  struct lu_tgt_desc *tgt)
+{
+       struct obd_statfs *sfs = &tgt->ltd_statfs;
+
+       if (((sfs->os_state & OS_STATE_ENOSPC) ||
+           (!ltd->ltd_is_mdt && sfs->os_state & OS_STATE_ENOINO &&
+            sfs->os_fprecreated == 0)))
+               return -ENOSPC;
+
+       /* If the OST is readonly then we can't allocate objects there */
+       if (sfs->os_state & OS_STATE_READONLY)
+               return -EROFS;
+
+       /* object precreation is skipped on the OST with max_create_count=0 */
+       if (!ltd->ltd_is_mdt && sfs->os_state & OS_STATE_NOPRECREATE)
+               return -ENOBUFS;
+
+       return 0;
+}
+
 /**
- * Check whether the target is available for new OST objects.
+ * Check whether the target is available for new objects.
  *
  * Request statfs data from the given target and verify it's active and not
- * read-only. If so, then it can be used to place new OST objects. This
+ * read-only. If so, then it can be used to place new objects. This
  * function also maintains the number of active/inactive targets and sets
  * dirty flags if those numbers change so others can run re-balance procedures.
  * No external locking is required.
  * \param[in] env      execution environment for this thread
  * \param[in] d                LOD device
  * \param[in] ltd      target table
- * \param[in] index    target index
- * \param[out] sfs     buffer for statfs data
+ * \param[in] tgt      target
  *
  * \retval 0           if the target is good
  * \retval negative    negated errno on error
-
  */
 static int lod_statfs_and_check(const struct lu_env *env, struct lod_device *d,
-                               struct lu_tgt_descs *ltd, int index,
-                               struct obd_statfs *sfs)
+                               struct lu_tgt_descs *ltd,
+                               struct lu_tgt_desc *tgt)
 {
        struct lov_desc *desc = &ltd->ltd_lov_desc;
-       struct lu_tgt_desc *tgt = LTD_TGT(ltd, index);
        int rc;
 
-       ENTRY;
-
        LASSERT(d);
        LASSERT(tgt);
 
-       rc = dt_statfs(env, tgt->ltd_tgt, sfs);
-
-       if (rc == 0 && ((sfs->os_state & OS_STATE_ENOSPC) ||
-           (sfs->os_state & OS_STATE_ENOINO && sfs->os_fprecreated == 0)))
-               RETURN(-ENOSPC);
-
+       rc = dt_statfs(env, tgt->ltd_tgt, &tgt->ltd_statfs);
        if (rc && rc != -ENOTCONN)
                CERROR("%s: statfs: rc = %d\n", lod2obd(d)->obd_name, rc);
 
-       /* If the OST is readonly then we can't allocate objects there */
-       if (sfs->os_state & OS_STATE_READONLY)
-               rc = -EROFS;
-
-       /* object precreation is skipped on the OST with max_create_count=0 */
-       if (sfs->os_state & OS_STATE_NOPRECREATE)
-               rc = -ENOBUFS;
+       if (!rc) {
+               rc = lod_statfs_check(ltd, tgt);
+               if (rc == -ENOSPC)
+                       return rc;
+       }
 
        /* check whether device has changed state (active, inactive) */
        if (rc != 0 && tgt->ltd_active) {
@@ -144,7 +153,21 @@ static int lod_statfs_and_check(const struct lu_env *env, struct lod_device *d,
                spin_unlock(&d->lod_lock);
        }
 
-       RETURN(rc);
+       return rc;
+}
+
+static int lod_is_tgt_usable(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt)
+{
+       int rc;
+
+       rc = lod_statfs_check(ltd, tgt);
+       if (rc)
+               return rc;
+
+       if (!tgt->ltd_active)
+               return -ENOTCONN;
+
+       return 0;
 }
 
 /**
@@ -156,43 +179,41 @@ static int lod_statfs_and_check(const struct lu_env *env, struct lod_device *d,
  *
  * \param[in] env      execution environment for this thread
  * \param[in] lod      LOD device
+ * \param[in] ltd      tgt table
  */
-void lod_qos_statfs_update(const struct lu_env *env, struct lod_device *lod)
+void lod_qos_statfs_update(const struct lu_env *env, struct lod_device *lod,
+                          struct lu_tgt_descs *ltd)
 {
        struct obd_device *obd = lod2obd(lod);
-       struct lu_tgt_pool *osts = &lod->lod_ost_descs.ltd_tgt_pool;
+       struct lu_tgt_desc *tgt;
        time64_t max_age;
-       unsigned int i;
        u64 avail;
-       int idx;
        ENTRY;
 
-       max_age = ktime_get_seconds() -
-                 2 * lod->lod_ost_descs.ltd_lov_desc.ld_qos_maxage;
+       max_age = ktime_get_seconds() - 2 * ltd->ltd_lov_desc.ld_qos_maxage;
 
        if (obd->obd_osfs_age > max_age)
                /* statfs data are quite recent, don't need to refresh it */
                RETURN_EXIT;
 
-       down_write(&lod->lod_ost_descs.ltd_qos.lq_rw_sem);
+       down_write(&ltd->ltd_qos.lq_rw_sem);
 
        if (obd->obd_osfs_age > max_age)
                goto out;
 
-       for (i = 0; i < osts->op_count; i++) {
-               idx = osts->op_array[i];
-               avail = OST_TGT(lod,idx)->ltd_statfs.os_bavail;
-               if (lod_statfs_and_check(env, lod, &lod->lod_ost_descs, idx,
-                                        &OST_TGT(lod, idx)->ltd_statfs))
+       ltd_foreach_tgt(ltd, tgt) {
+               avail = tgt->ltd_statfs.os_bavail;
+               if (lod_statfs_and_check(env, lod, ltd, tgt))
                        continue;
-               if (OST_TGT(lod,idx)->ltd_statfs.os_bavail != avail)
+
+               if (tgt->ltd_statfs.os_bavail != avail)
                        /* recalculate weigths */
-                       lod->lod_ost_descs.ltd_qos.lq_dirty = 1;
+                       ltd->ltd_qos.lq_dirty = 1;
        }
        obd->obd_osfs_age = ktime_get_seconds();
 
 out:
-       up_write(&lod->lod_ost_descs.ltd_qos.lq_rw_sem);
+       up_write(&ltd->ltd_qos.lq_rw_sem);
        EXIT;
 }
 
@@ -208,17 +229,19 @@ out:
  * a new target or activation/deactivation).
  *
  * \param[in] lod      LOD device
- * \param[in] src_pool OST pool
+ * \param[in] ltd      tgt table
+ * \param[in] src_pool tgt pool
  * \param[in] lqr      round-robin list
  *
  * \retval 0           on success
  * \retval -ENOMEM     fails to allocate the array
  */
-static int lod_qos_calc_rr(struct lod_device *lod, struct lu_tgt_pool *src_pool,
+static int lod_qos_calc_rr(struct lod_device *lod, struct lu_tgt_descs *ltd,
+                          const struct lu_tgt_pool *src_pool,
                           struct lu_qos_rr *lqr)
 {
-       struct lu_svr_qos  *oss;
-       struct lod_tgt_desc *ost;
+       struct lu_svr_qos  *svr;
+       struct lu_tgt_desc *tgt;
        unsigned placed, real_count;
        unsigned int i;
        int rc;
@@ -230,7 +253,7 @@ static int lod_qos_calc_rr(struct lod_device *lod, struct lu_tgt_pool *src_pool,
        }
 
        /* Do actual allocation. */
-       down_write(&lod->lod_ost_descs.ltd_qos.lq_rw_sem);
+       down_write(&ltd->ltd_qos.lq_rw_sem);
 
        /*
         * Check again. While we were sleeping on @lq_rw_sem something could
@@ -238,7 +261,7 @@ static int lod_qos_calc_rr(struct lod_device *lod, struct lu_tgt_pool *src_pool,
         */
        if (!lqr->lqr_dirty) {
                LASSERT(lqr->lqr_pool.op_size);
-               up_write(&lod->lod_ost_descs.ltd_qos.lq_rw_sem);
+               up_write(&ltd->ltd_qos.lq_rw_sem);
                RETURN(0);
        }
 
@@ -249,34 +272,33 @@ static int lod_qos_calc_rr(struct lod_device *lod, struct lu_tgt_pool *src_pool,
           deleting from the pool. The lq_rw_sem insures that nobody else
           is reading. */
        lqr->lqr_pool.op_count = real_count;
-       rc = lod_ost_pool_extend(&lqr->lqr_pool, real_count);
+       rc = lod_tgt_pool_extend(&lqr->lqr_pool, real_count);
        if (rc) {
-               up_write(&lod->lod_ost_descs.ltd_qos.lq_rw_sem);
+               up_write(&ltd->ltd_qos.lq_rw_sem);
                RETURN(rc);
        }
        for (i = 0; i < lqr->lqr_pool.op_count; i++)
                lqr->lqr_pool.op_array[i] = LOV_QOS_EMPTY;
 
-       /* Place all the OSTs from 1 OSS at the same time. */
+       /* Place all the tgts from 1 svr at the same time. */
        placed = 0;
-       list_for_each_entry(oss, &lod->lod_ost_descs.ltd_qos.lq_svr_list,
-                           lsq_svr_list) {
+       list_for_each_entry(svr, &ltd->ltd_qos.lq_svr_list, lsq_svr_list) {
                int j = 0;
 
                for (i = 0; i < lqr->lqr_pool.op_count; i++) {
                        int next;
 
-                       if (!cfs_bitmap_check(lod->lod_ost_bitmap,
-                                               src_pool->op_array[i]))
+                       if (!cfs_bitmap_check(ltd->ltd_tgt_bitmap,
+                                             src_pool->op_array[i]))
                                continue;
 
-                       ost = OST_TGT(lod,src_pool->op_array[i]);
-                       LASSERT(ost && ost->ltd_tgt);
-                       if (ost->ltd_qos.ltq_svr != oss)
+                       tgt = LTD_TGT(ltd, src_pool->op_array[i]);
+                       LASSERT(tgt && tgt->ltd_tgt);
+                       if (tgt->ltd_qos.ltq_svr != svr)
                                continue;
 
-                       /* Evenly space these OSTs across arrayspace */
-                       next = j * lqr->lqr_pool.op_count / oss->lsq_tgt_count;
+                       /* Evenly space these tgts across arrayspace */
+                       next = j * lqr->lqr_pool.op_count / svr->lsq_tgt_count;
                        while (lqr->lqr_pool.op_array[next] != LOV_QOS_EMPTY)
                                next = (next + 1) % lqr->lqr_pool.op_count;
 
@@ -287,15 +309,15 @@ static int lod_qos_calc_rr(struct lod_device *lod, struct lu_tgt_pool *src_pool,
        }
 
        lqr->lqr_dirty = 0;
-       up_write(&lod->lod_ost_descs.ltd_qos.lq_rw_sem);
+       up_write(&ltd->ltd_qos.lq_rw_sem);
 
        if (placed != real_count) {
                /* This should never happen */
-               LCONSOLE_ERROR_MSG(0x14e, "Failed to place all OSTs in the "
+               LCONSOLE_ERROR_MSG(0x14e, "Failed to place all tgts in the "
                                   "round-robin list (%d of %d).\n",
                                   placed, real_count);
                for (i = 0; i < lqr->lqr_pool.op_count; i++) {
-                       LCONSOLE(D_WARNING, "rr #%d ost idx=%d\n", i,
+                       LCONSOLE(D_WARNING, "rr #%d tgt idx=%d\n", i,
                                 lqr->lqr_pool.op_array[i]);
                }
                lqr->lqr_dirty = 1;
@@ -401,7 +423,7 @@ static int min_stripe_count(__u32 stripe_count, int flags)
 #define LOV_CREATE_RESEED_MIN  2000
 
 /**
- * Initialize temporary OST-in-use array.
+ * Initialize temporary tgt-in-use array.
  *
  * Allocate or extend the array used to mark targets already assigned to a new
  * striping so they are not used more than once.
@@ -412,7 +434,7 @@ static int min_stripe_count(__u32 stripe_count, int flags)
  * \retval 0           on success
  * \retval -ENOMEM     on error
  */
-static inline int lod_qos_ost_in_use_clear(const struct lu_env *env,
+static inline int lod_qos_tgt_in_use_clear(const struct lu_env *env,
                                           __u32 stripes)
 {
        struct lod_thread_info *info = lod_env_info(env);
@@ -431,43 +453,44 @@ static inline int lod_qos_ost_in_use_clear(const struct lu_env *env,
  * Remember a target in the array of used targets.
  *
  * Mark the given target as used for a new striping being created. The status
- * of an OST in a striping can be checked with lod_qos_is_ost_used().
+ * of a tgt in a striping can be checked with lod_qos_is_tgt_used().
  *
  * \param[in] env      execution environment for this thread
  * \param[in] idx      index in the array
- * \param[in] ost      OST target index to mark as used
+ * \param[in] tgt_idx  target index to mark as used
  */
-static inline void lod_qos_ost_in_use(const struct lu_env *env,
-                                     int idx, int ost)
+static inline void lod_qos_tgt_in_use(const struct lu_env *env,
+                                     int idx, int tgt_idx)
 {
        struct lod_thread_info *info = lod_env_info(env);
-       int *osts = info->lti_ea_store;
+       int *tgts = info->lti_ea_store;
 
        LASSERT(info->lti_ea_store_size >= idx * sizeof(int));
-       osts[idx] = ost;
+       tgts[idx] = tgt_idx;
 }
 
 /**
- * Check is OST used in a striping.
+ * Check whether a tgt is used in a striping.
  *
- * Checks whether OST with the given index is marked as used in the temporary
- * array (see lod_qos_ost_in_use()).
+ * Checks whether tgt with the given index is marked as used in the temporary
+ * array (see lod_qos_tgt_in_use()).
  *
  * \param[in] env      execution environment for this thread
- * \param[in] ost      OST target index to check
+ * \param[in] tgt_idx  target index to check
  * \param[in] stripes  the number of items used in the array already
  *
  * \retval 0           not used
  * \retval 1           used
  */
-static int lod_qos_is_ost_used(const struct lu_env *env, int ost, __u32 stripes)
+static int lod_qos_is_tgt_used(const struct lu_env *env, int tgt_idx,
+                              __u32 stripes)
 {
        struct lod_thread_info *info = lod_env_info(env);
-       int *osts = info->lti_ea_store;
+       int *tgts = info->lti_ea_store;
        __u32 j;
 
        for (j = 0; j < stripes; j++) {
-               if (osts[j] == ost)
+               if (tgts[j] == tgt_idx)
                        return 1;
        }
        return 0;
@@ -580,8 +603,7 @@ static inline bool lod_should_avoid_ost(struct lod_object *lo,
 static int lod_check_and_reserve_ost(const struct lu_env *env,
                                     struct lod_object *lo,
                                     struct lod_layout_component *lod_comp,
-                                    struct obd_statfs *sfs, __u32 ost_idx,
-                                    __u32 speed, __u32 *s_idx,
+                                    __u32 ost_idx, __u32 speed, __u32 *s_idx,
                                     struct dt_object **stripe,
                                     __u32 *ost_indices,
                                     struct thandle *th,
@@ -589,12 +611,14 @@ static int lod_check_and_reserve_ost(const struct lu_env *env,
 {
        struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
        struct lod_avoid_guide *lag = &lod_env_info(env)->lti_avoid;
+       struct lu_tgt_desc *ost = OST_TGT(lod, ost_idx);
        struct dt_object   *o;
        __u32 stripe_idx = *s_idx;
        int rc;
+
        ENTRY;
 
-       rc = lod_statfs_and_check(env, lod, &lod->lod_ost_descs, ost_idx, sfs);
+       rc = lod_statfs_and_check(env, lod, &lod->lod_ost_descs, ost);
        if (rc)
                RETURN(rc);
 
@@ -602,7 +626,7 @@ static int lod_check_and_reserve_ost(const struct lu_env *env,
         * We expect number of precreated objects in f_ffree at
         * the first iteration, skip OSPs with no objects ready
         */
-       if (sfs->os_fprecreated == 0 && speed == 0) {
+       if (ost->ltd_statfs.os_fprecreated == 0 && speed == 0) {
                QOS_DEBUG("#%d: precreation is empty\n", ost_idx);
                RETURN(rc);
        }
@@ -610,7 +634,7 @@ static int lod_check_and_reserve_ost(const struct lu_env *env,
        /*
         * try to use another OSP if this one is degraded
         */
-       if (sfs->os_state & OS_STATE_DEGRADED && speed < 2) {
+       if (ost->ltd_statfs.os_state & OS_STATE_DEGRADED && speed < 2) {
                QOS_DEBUG("#%d: degraded\n", ost_idx);
                RETURN(rc);
        }
@@ -630,13 +654,13 @@ static int lod_check_and_reserve_ost(const struct lu_env *env,
         * for the first and second time.
         */
        if (speed < 2 && lod_should_avoid_ost(lo, lag, ost_idx)) {
-               QOS_DEBUG("iter %d: OST%d used by conflicting mirror "
-                         "component\n", speed, ost_idx);
+               QOS_DEBUG("iter %d: OST%d used by conflicting mirror component\n",
+                         speed, ost_idx);
                RETURN(rc);
        }
 
        /* do not put >1 objects on a single OST, except for overstriping */
-       if (lod_qos_is_ost_used(env, ost_idx, stripe_idx)) {
+       if (lod_qos_is_tgt_used(env, ost_idx, stripe_idx)) {
                if (lod_comp->llc_pattern & LOV_PATTERN_OVERSTRIPING)
                        *overstriped = true;
                else
@@ -655,7 +679,7 @@ static int lod_check_and_reserve_ost(const struct lu_env *env,
         * We've successfully declared (reserved) an object
         */
        lod_avoid_update(lo, lag);
-       lod_qos_ost_in_use(env, stripe_idx, ost_idx);
+       lod_qos_tgt_in_use(env, stripe_idx, ost_idx);
        stripe[stripe_idx] = o;
        ost_indices[stripe_idx] = ost_idx;
        OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_LOV_CREATE_RACE, 2);
@@ -691,13 +715,12 @@ static int lod_check_and_reserve_ost(const struct lu_env *env,
  * \retval -ENOSPC     if not enough OSTs are found
  * \retval negative    negated errno for other failures
  */
-static int lod_alloc_rr(const struct lu_env *env, struct lod_object *lo,
-                       struct dt_object **stripe, __u32 *ost_indices,
-                       int flags, struct thandle *th, int comp_idx)
+static int lod_ost_alloc_rr(const struct lu_env *env, struct lod_object *lo,
+                           struct dt_object **stripe, __u32 *ost_indices,
+                           int flags, struct thandle *th, int comp_idx)
 {
        struct lod_layout_component *lod_comp;
        struct lod_device *m = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
-       struct obd_statfs *sfs = &lod_env_info(env)->lti_osfs;
        struct pool_desc  *pool = NULL;
        struct lu_tgt_pool *osts;
        struct lu_qos_rr *lqr;
@@ -727,11 +750,11 @@ static int lod_alloc_rr(const struct lu_env *env, struct lod_object *lo,
                lqr = &(m->lod_ost_descs.ltd_qos.lq_rr);
        }
 
-       rc = lod_qos_calc_rr(m, osts, lqr);
+       rc = lod_qos_calc_rr(m, &m->lod_ost_descs, osts, lqr);
        if (rc)
                GOTO(out, rc);
 
-       rc = lod_qos_ost_in_use_clear(env, stripe_count);
+       rc = lod_qos_tgt_in_use_clear(env, stripe_count);
        if (rc)
                GOTO(out, rc);
 
@@ -786,7 +809,7 @@ repeat_find:
                        continue;
 
                spin_unlock(&lqr->lqr_alloc);
-               rc = lod_check_and_reserve_ost(env, lo, lod_comp, sfs, ost_idx,
+               rc = lod_check_and_reserve_ost(env, lo, lod_comp, ost_idx,
                                               speed, &stripe_idx, stripe,
                                               ost_indices, th, &overstriped);
                spin_lock(&lqr->lqr_alloc);
@@ -835,6 +858,165 @@ out:
 }
 
 /**
+ * Allocate a striping using round-robin algorithm.
+ *
+ * Allocates a new striping using round-robin algorithm. The function refreshes
+ * all the internal structures (statfs cache, array of available remote MDTs
+ * sorted with regard to MDS, etc). The number of stripes required is taken from
+ * the object (must be prepared by the caller). The caller should ensure nobody
+ * else is trying to create a striping on the object in parallel. All the
+ * internal structures (like pools, etc) are protected and no additional locking
+ * is required. The function succeeds even if a single stripe is allocated.
+ *
+ * \param[in] env              execution environment for this thread
+ * \param[in] lo               LOD object
+ * \param[out] stripe          striping created
+ *
+ * \retval positive    stripe objects allocated, including the first stripe
+ *                     allocated outside
+ * \retval -ENOSPC     if not enough MDTs are found
+ * \retval negative    negated errno for other failures
+ */
+int lod_mdt_alloc_rr(const struct lu_env *env, struct lod_object *lo,
+                    struct dt_object **stripe)
+{
+       struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
+       struct lu_tgt_descs *ltd = &lod->lod_mdt_descs;
+       struct lu_tgt_pool *pool;
+       struct lu_qos_rr *lqr;
+       struct lu_tgt_desc *mdt;
+       struct lu_object_conf conf = { .loc_flags = LOC_F_NEW };
+       struct lu_fid fid = { 0 };
+       struct dt_object *dto;
+       unsigned int pool_idx;
+       unsigned int i;
+       u32 start_idx_temp;
+       u32 stripe_count = lo->ldo_dir_stripe_count;
+       u32 stripe_idx = 1;
+       u32 mdt_idx;
+       bool use_degraded = false;
+       int tgt_connecting = 0;
+       int rc;
+
+       ENTRY;
+
+       pool = &ltd->ltd_tgt_pool;
+       lqr = &ltd->ltd_qos.lq_rr;
+       rc = lod_qos_calc_rr(lod, ltd, pool, lqr);
+       if (rc)
+               RETURN(rc);
+
+       rc = lod_qos_tgt_in_use_clear(env, stripe_count);
+       if (rc)
+               RETURN(rc);
+
+       down_read(&ltd->ltd_qos.lq_rw_sem);
+       spin_lock(&lqr->lqr_alloc);
+       if (--lqr->lqr_start_count <= 0) {
+               lqr->lqr_start_idx = prandom_u32_max(pool->op_count);
+               lqr->lqr_start_count =
+                       (LOV_CREATE_RESEED_MIN / max(pool->op_count, 1U) +
+                        LOV_CREATE_RESEED_MULT) * max(pool->op_count, 1U);
+       } else if (stripe_count - 1 >= pool->op_count ||
+                  lqr->lqr_start_idx > pool->op_count) {
+               /* If we have allocated from all of the tgts, slowly
+                * precess the next start if the tgt/stripe count isn't
+                * already doing this for us. */
+               lqr->lqr_start_idx %= pool->op_count;
+               if (stripe_count - 1 > 1 &&
+                   (pool->op_count % (stripe_count - 1)) != 1)
+                       ++lqr->lqr_offset_idx;
+       }
+       start_idx_temp = lqr->lqr_start_idx;
+
+repeat_find:
+       QOS_DEBUG("want %d start_idx %d start_count %d offset %d active %d count %d\n",
+                 stripe_count - 1, lqr->lqr_start_idx, lqr->lqr_start_count,
+                 lqr->lqr_offset_idx, pool->op_count, pool->op_count);
+
+       for (i = 0; i < pool->op_count && stripe_idx < stripe_count; i++) {
+               pool_idx = (lqr->lqr_start_idx + lqr->lqr_offset_idx) %
+                           pool->op_count;
+               ++lqr->lqr_start_idx;
+               mdt_idx = lqr->lqr_pool.op_array[pool_idx];
+               mdt = LTD_TGT(ltd, mdt_idx);
+
+               QOS_DEBUG("#%d strt %d act %d strp %d ary %d idx %d\n",
+                         i, lqr->lqr_start_idx, /* XXX: active*/ 0,
+                         stripe_idx, pool_idx, mdt_idx);
+
+               if (mdt_idx == LOV_QOS_EMPTY ||
+                   !cfs_bitmap_check(ltd->ltd_tgt_bitmap, mdt_idx))
+                       continue;
+
+               /* do not put >1 objects on one MDT */
+               if (lod_qos_is_tgt_used(env, mdt_idx, stripe_idx))
+                       continue;
+
+               rc = lod_is_tgt_usable(ltd, mdt);
+               if (rc) {
+                       if (mdt->ltd_connecting)
+                               tgt_connecting = 1;
+                       continue;
+               }
+
+               /* try to use another OSP if this one is degraded */
+               if (mdt->ltd_statfs.os_state & OS_STATE_DEGRADED &&
+                   !use_degraded) {
+                       QOS_DEBUG("#%d: degraded\n", mdt_idx);
+                       continue;
+               }
+               spin_unlock(&lqr->lqr_alloc);
+
+               rc = obd_fid_alloc(env, mdt->ltd_exp, &fid, NULL);
+               if (rc) {
+                       QOS_DEBUG("#%d: alloc FID failed: %dl\n", mdt_idx, rc);
+                       spin_lock(&lqr->lqr_alloc);
+                       continue;
+               }
+
+               dto = dt_locate_at(env, mdt->ltd_tgt, &fid,
+                               lo->ldo_obj.do_lu.lo_dev->ld_site->ls_top_dev,
+                               &conf);
+
+               spin_lock(&lqr->lqr_alloc);
+               if (IS_ERR(dto)) {
+                       QOS_DEBUG("can't alloc stripe on #%u: %d\n",
+                                 mdt->ltd_index, (int) PTR_ERR(dto));
+
+                       if (mdt->ltd_connecting)
+                               tgt_connecting = 1;
+                       continue;
+               }
+
+               lod_qos_tgt_in_use(env, stripe_idx, mdt_idx);
+               stripe[stripe_idx] = dto;
+               stripe_idx++;
+       }
+
+       if (!use_degraded && stripe_idx < stripe_count) {
+               /* Try again, allowing slower OSCs */
+               use_degraded = true;
+               lqr->lqr_start_idx = start_idx_temp;
+
+               tgt_connecting = 0;
+               goto repeat_find;
+       }
+       spin_unlock(&lqr->lqr_alloc);
+       up_read(&ltd->ltd_qos.lq_rw_sem);
+
+       if (stripe_idx > 1)
+               /* at least one stripe is allocated */
+               RETURN(stripe_idx);
+
+       /* nobody provided us with a single object */
+       if (tgt_connecting)
+               RETURN(-EINPROGRESS);
+
+       RETURN(-ENOSPC);
+}
+
+/**
  * Allocate a specific striping layout on a user defined set of OSTs.
  *
  * Allocates new striping using the OST index range provided by the data from
@@ -865,7 +1047,6 @@ static int lod_alloc_ost_list(const struct lu_env *env, struct lod_object *lo,
 {
        struct lod_layout_component *lod_comp;
        struct lod_device       *m = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
-       struct obd_statfs       *sfs = &lod_env_info(env)->lti_osfs;
        struct dt_object        *o;
        unsigned int            array_idx = 0;
        int                     stripe_count = 0;
@@ -879,7 +1060,7 @@ static int lod_alloc_ost_list(const struct lu_env *env, struct lod_object *lo,
        LASSERT(lod_comp->llc_ostlist.op_array);
        LASSERT(lod_comp->llc_ostlist.op_count);
 
-       rc = lod_qos_ost_in_use_clear(env, lod_comp->llc_stripe_count);
+       rc = lod_qos_tgt_in_use_clear(env, lod_comp->llc_stripe_count);
        if (rc < 0)
                RETURN(rc);
 
@@ -913,14 +1094,14 @@ static int lod_alloc_ost_list(const struct lu_env *env, struct lod_object *lo,
                /* do not put >1 objects on a single OST, except for
                 * overstriping
                 */
-               if (lod_qos_is_ost_used(env, ost_idx, stripe_count) &&
+               if (lod_qos_is_tgt_used(env, ost_idx, stripe_count) &&
                    !(lod_comp->llc_pattern & LOV_PATTERN_OVERSTRIPING)) {
                        rc = -EINVAL;
                        break;
                }
 
-               rc = lod_statfs_and_check(env, m, &m->lod_ost_descs, ost_idx,
-                                         sfs);
+               rc = lod_statfs_and_check(env, m, &m->lod_ost_descs,
+                                         LTD_TGT(&m->lod_ost_descs, ost_idx));
                if (rc < 0) /* this OSP doesn't feel well */
                        break;
 
@@ -936,7 +1117,7 @@ static int lod_alloc_ost_list(const struct lu_env *env, struct lod_object *lo,
                /*
                 * We've successfully declared (reserved) an object
                 */
-               lod_qos_ost_in_use(env, stripe_count, ost_idx);
+               lod_qos_tgt_in_use(env, stripe_count, ost_idx);
                stripe[stripe_count] = o;
                ost_indices[stripe_count] = ost_idx;
                stripe_count++;
@@ -971,14 +1152,15 @@ static int lod_alloc_ost_list(const struct lu_env *env, struct lod_object *lo,
  * \retval -EINVAL     requested offset is invalid
  * \retval negative    errno on failure
  */
-static int lod_alloc_specific(const struct lu_env *env, struct lod_object *lo,
-                             struct dt_object **stripe, __u32 *ost_indices,
-                             int flags, struct thandle *th, int comp_idx)
+static int lod_ost_alloc_specific(const struct lu_env *env,
+                                 struct lod_object *lo,
+                                 struct dt_object **stripe, __u32 *ost_indices,
+                                 int flags, struct thandle *th, int comp_idx)
 {
        struct lod_layout_component *lod_comp;
        struct lod_device *m = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
-       struct obd_statfs *sfs = &lod_env_info(env)->lti_osfs;
        struct dt_object *o;
+       struct lu_tgt_desc *tgt;
        __u32 ost_idx;
        unsigned int i, array_idx, ost_count;
        int rc, stripe_num = 0;
@@ -992,7 +1174,7 @@ static int lod_alloc_specific(const struct lu_env *env, struct lod_object *lo,
        LASSERT(lo->ldo_comp_cnt > comp_idx && lo->ldo_comp_entries != NULL);
        lod_comp = &lo->ldo_comp_entries[comp_idx];
 
-       rc = lod_qos_ost_in_use_clear(env, lod_comp->llc_stripe_count);
+       rc = lod_qos_tgt_in_use_clear(env, lod_comp->llc_stripe_count);
        if (rc)
                GOTO(out, rc);
 
@@ -1044,7 +1226,7 @@ repeat_find:
                 * do not put >1 objects on a single OST, except for
                 * overstriping, where it is intended
                 */
-               if (lod_qos_is_ost_used(env, ost_idx, stripe_num)) {
+               if (lod_qos_is_tgt_used(env, ost_idx, stripe_num)) {
                        if (lod_comp->llc_pattern & LOV_PATTERN_OVERSTRIPING)
                                overstriped = true;
                        else
@@ -1058,14 +1240,15 @@ repeat_find:
                    lod_comp_is_ost_used(env, lo, ost_idx))
                        continue;
 
+               tgt = LTD_TGT(&m->lod_ost_descs, ost_idx);
+
                /* Drop slow OSCs if we can, but not for requested start idx.
                 *
                 * This means "if OSC is slow and it is not the requested
                 * start OST, then it can be skipped, otherwise skip it only
                 * if it is inactive/recovering/out-of-space." */
 
-               rc = lod_statfs_and_check(env, m, &m->lod_ost_descs, ost_idx,
-                                         sfs);
+               rc = lod_statfs_and_check(env, m, &m->lod_ost_descs, tgt);
                if (rc) {
                        /* this OSP doesn't feel well */
                        continue;
@@ -1076,7 +1259,7 @@ repeat_find:
                 * iteration.  Skip OSPs with no objects ready.  Don't apply
                 * this logic to OST specified with stripe_offset.
                 */
-               if (i != 0 && sfs->os_fprecreated == 0 && speed == 0)
+               if (i && !tgt->ltd_statfs.os_fprecreated && !speed)
                        continue;
 
                o = lod_qos_declare_object_on(env, m, ost_idx, th);
@@ -1089,7 +1272,7 @@ repeat_find:
                /*
                 * We've successfully declared (reserved) an object
                 */
-               lod_qos_ost_in_use(env, stripe_num, ost_idx);
+               lod_qos_tgt_in_use(env, stripe_num, ost_idx);
                stripe[stripe_num] = o;
                ost_indices[stripe_num] = ost_idx;
                stripe_num++;
@@ -1164,13 +1347,12 @@ out:
  * \retval -EINVAL     requested OST index is invalid
  * \retval negative    errno on failure
  */
-static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo,
-                        struct dt_object **stripe, __u32 *ost_indices,
-                        int flags, struct thandle *th, int comp_idx)
+static int lod_ost_alloc_qos(const struct lu_env *env, struct lod_object *lo,
+                            struct dt_object **stripe, __u32 *ost_indices,
+                            int flags, struct thandle *th, int comp_idx)
 {
        struct lod_layout_component *lod_comp;
        struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
-       struct obd_statfs *sfs = &lod_env_info(env)->lti_osfs;
        struct lod_avoid_guide *lag = &lod_env_info(env)->lti_avoid;
        struct lod_tgt_desc *ost;
        struct dt_object *o;
@@ -1223,7 +1405,7 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo,
        if (rc)
                GOTO(out, rc);
 
-       rc = lod_qos_ost_in_use_clear(env, lod_comp->llc_stripe_count);
+       rc = lod_qos_tgt_in_use_clear(env, lod_comp->llc_stripe_count);
        if (rc)
                GOTO(out, rc);
 
@@ -1236,18 +1418,18 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo,
                ost = OST_TGT(lod, osts->op_array[i]);
                ost->ltd_qos.ltq_usable = 0;
 
-               rc = lod_statfs_and_check(env, lod, &lod->lod_ost_descs,
-                                         osts->op_array[i], sfs);
+               rc = lod_statfs_and_check(env, lod, &lod->lod_ost_descs, ost);
                if (rc) {
                        /* this OSP doesn't feel well */
                        continue;
                }
 
-               if (sfs->os_state & OS_STATE_DEGRADED)
+               if (ost->ltd_statfs.os_state & OS_STATE_DEGRADED)
                        continue;
 
                /* Fail Check before osc_precreate() is called
-                  so we can only 'fail' single OSC. */
+                * so we can only 'fail' single OSC.
+                */
                if (OBD_FAIL_CHECK(OBD_FAIL_MDS_OSC_PRECREATE) &&
                                   osts->op_array[i] == 0)
                        continue;
@@ -1281,7 +1463,8 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo,
                rand = lu_prandom_u64_max(total_weight);
 
                /* On average, this will hit larger-weighted OSTs more often.
-                * 0-weight OSTs will always get used last (only when rand=0) */
+                * 0-weight OSTs will always get used last (only when rand=0)
+                */
                for (i = 0; i < osts->op_count; i++) {
                        __u32 idx = osts->op_array[i];
 
@@ -1311,7 +1494,7 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo,
                            !(lod_comp->llc_pattern & LOV_PATTERN_OVERSTRIPING))
                                continue;
 
-                       if (lod_qos_is_ost_used(env, idx, nfound)) {
+                       if (lod_qos_is_tgt_used(env, idx, nfound)) {
                                if (lod_comp->llc_pattern &
                                    LOV_PATTERN_OVERSTRIPING)
                                        overstriped = true;
@@ -1327,7 +1510,7 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo,
                        }
 
                        lod_avoid_update(lo, lag);
-                       lod_qos_ost_in_use(env, nfound, idx);
+                       lod_qos_tgt_in_use(env, nfound, idx);
                        stripe[nfound] = o;
                        ost_indices[nfound] = idx;
                        ltd_qos_update(&lod->lod_ost_descs, ost, &total_weight);
@@ -1385,6 +1568,207 @@ out_nolock:
 }
 
 /**
+ * Allocate a striping using an algorithm with weights.
+ *
+ * The function allocates remote MDT objects to create a striping, the first
+ * object was already allocated on current MDT to ensure master object and
+ * the first object are on the same MDT. The algorithm used is based on weights
+ * (both free space and inodes), and it's trying to ensure the space/inodes are
+ * used evenly by MDTs and MDSs. The striping configuration (# of stripes,
+ * offset, pool) is taken from the object and is prepared by the caller.
+ *
+ * If prepared configuration can't be met due to too few MDTs, then allocation
+ * fails.
+ *
+ * No concurrent allocation is allowed on the object and this must be ensured
+ * by the caller. All the internal structures are protected by the function.
+ *
+ * The algorithm has two steps: find available MDTs and calculate their
+ * weights, then select the MDTs with their weights used as the probability.
+ * An MDT with a higher weight is proportionately more likely to be selected
+ * than one with a lower weight.
+ *
+ * \param[in] env              execution environment for this thread
+ * \param[in] lo               LOD object
+ * \param[out] stripes         striping created
+ *
+ * \retval positive    stripes allocated, and it should be equal to
+ *                     lo->ldo_dir_stripe_count
+ * \retval -EAGAIN     not enough tgts are found for specified stripe count
+ * \retval -EINVAL     requested MDT index is invalid
+ * \retval negative    errno on failure
+ */
+int lod_mdt_alloc_qos(const struct lu_env *env, struct lod_object *lo,
+                     struct dt_object **stripes)
+{
+       struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
+       struct lu_tgt_descs *ltd = &lod->lod_mdt_descs;
+       struct lu_object_conf conf = { .loc_flags = LOC_F_NEW };
+       struct lu_fid fid = { 0 };
+       const struct lu_tgt_pool *pool;
+       struct lu_tgt_desc *mdt;
+       struct dt_object *dto;
+       u64 total_weight = 0;
+       u32 stripe_count = lo->ldo_dir_stripe_count;
+       unsigned int nfound;
+       unsigned int good_mdts;
+       unsigned int i;
+       int rc = 0;
+
+       ENTRY;
+
+       if (stripe_count == 1)
+               RETURN(1);
+
+       pool = &ltd->ltd_tgt_pool;
+
+       /* Detect -EAGAIN early, before expensive lock is taken. */
+       if (!ltd_qos_is_usable(ltd))
+               RETURN(-EAGAIN);
+
+       /* Do actual allocation, use write lock here. */
+       down_write(&ltd->ltd_qos.lq_rw_sem);
+
+       /*
+        * Check again, while we were sleeping on @lq_rw_sem things could
+        * change.
+        */
+       if (!ltd_qos_is_usable(ltd))
+               GOTO(unlock, rc = -EAGAIN);
+
+       rc = ltd_qos_penalties_calc(ltd);
+       if (rc)
+               GOTO(unlock, rc);
+
+       rc = lod_qos_tgt_in_use_clear(env, stripe_count);
+       if (rc)
+               GOTO(unlock, rc);
+
+       good_mdts = 0;
+       /* Find all the tgts that are valid stripe candidates */
+       for (i = 0; i < pool->op_count; i++) {
+               if (!cfs_bitmap_check(ltd->ltd_tgt_bitmap, pool->op_array[i]))
+                       continue;
+
+               mdt = LTD_TGT(ltd, pool->op_array[i]);
+               mdt->ltd_qos.ltq_usable = 0;
+
+               rc = lod_is_tgt_usable(ltd, mdt);
+               if (rc)
+                       continue;
+
+               if (mdt->ltd_statfs.os_state & OS_STATE_DEGRADED)
+                       continue;
+
+               mdt->ltd_qos.ltq_usable = 1;
+               lu_tgt_qos_weight_calc(mdt);
+               total_weight += mdt->ltd_qos.ltq_weight;
+
+               good_mdts++;
+       }
+
+       QOS_DEBUG("found %d good tgts\n", good_mdts);
+
+       if (good_mdts < stripe_count - 1)
+               GOTO(unlock, rc = -EAGAIN);
+
+       /* Find enough tgts with weighted random allocation. */
+       nfound = 1;
+       while (nfound < stripe_count) {
+               u64 rand, cur_weight;
+
+               cur_weight = 0;
+               rc = -ENOSPC;
+
+               rand = lu_prandom_u64_max(total_weight);
+
+               /* On average, this will hit larger-weighted tgts more often.
+                * 0-weight tgts will always get used last (only when rand=0) */
+               for (i = 0; i < pool->op_count; i++) {
+                       __u32 idx = pool->op_array[i];
+                       int rc2;
+
+                       mdt = LTD_TGT(ltd, idx);
+
+                       if (!mdt->ltd_qos.ltq_usable)
+                               continue;
+
+                       cur_weight += mdt->ltd_qos.ltq_weight;
+
+                       QOS_DEBUG("idx=%d nfound=%d cur_weight=%llu rand=%llu total_weight=%llu\n",
+                                 idx, nfound, cur_weight, rand,
+                                 total_weight);
+
+                       if (cur_weight < rand)
+                               continue;
+
+                       QOS_DEBUG("stripe=%d to idx=%d\n", nfound, idx);
+
+                       if (lod_qos_is_tgt_used(env, idx, nfound))
+                               continue;
+
+                       rc2 = obd_fid_alloc(env, mdt->ltd_exp, &fid, NULL);
+                       if (rc2) {
+                               QOS_DEBUG("can't alloc FID on #%u: %d\n",
+                                         idx, rc2);
+                               continue;
+                       }
+
+                       conf.loc_flags = LOC_F_NEW;
+                       dto = dt_locate_at(env, mdt->ltd_tgt, &fid,
+                               lo->ldo_obj.do_lu.lo_dev->ld_site->ls_top_dev,
+                               &conf);
+                       if (IS_ERR(dto)) {
+                               QOS_DEBUG("can't alloc stripe on #%u: %d\n",
+                                         idx, (int) PTR_ERR(dto));
+                               continue;
+                       }
+
+                       lod_qos_tgt_in_use(env, nfound, idx);
+                       stripes[nfound] = dto;
+                       ltd_qos_update(ltd, mdt, &total_weight);
+                       nfound++;
+                       rc = 0;
+                       break;
+               }
+
+               /* no MDT found on this iteration, give up */
+               if (rc)
+                       break;
+       }
+
+       if (unlikely(nfound != stripe_count)) {
+               /*
+                * when the decision to use weighted algorithm was made
+                * we had enough appropriate OSPs, but this state can
+                * change anytime (no space on MDT, broken connection, etc)
+                * so it's possible OSP won't be able to provide us with
+                * an object due to just changed state
+                */
+               QOS_DEBUG("%s: wanted %d objects, found only %d\n",
+                         lod2obd(lod)->obd_name, stripe_count, nfound);
+               for (i = 1; i < nfound; i++) {
+                       LASSERT(stripes[i] != NULL);
+                       dt_object_put(env, stripes[i]);
+                       stripes[i] = NULL;
+               }
+
+               /* makes sense to rebalance next time */
+               ltd->ltd_qos.lq_dirty = 1;
+               ltd->ltd_qos.lq_same_space = 0;
+
+               rc = -EAGAIN;
+       } else {
+               rc = nfound;
+       }
+
+unlock:
+       up_write(&ltd->ltd_qos.lq_rw_sem);
+
+       RETURN(rc);
+}
+
+/**
  * Check stripe count the caller can use.
  *
  * For new layouts (no initialized components), check the total size of the
@@ -2041,7 +2425,7 @@ int lod_qos_prep_create(const struct lu_env *env, struct lod_object *lo,
                 * statfs and check OST targets now, since ld_active_tgt_count
                 * could be changed if some OSTs are [de]activated manually.
                 */
-               lod_qos_statfs_update(env, d);
+               lod_qos_statfs_update(env, d, &d->lod_ost_descs);
                stripe_len = lod_get_stripe_count(d, lo,
                                                  lod_comp->llc_stripe_count,
                                                  lod_comp->llc_pattern &
@@ -2079,14 +2463,16 @@ int lod_qos_prep_create(const struct lu_env *env, struct lod_object *lo,
                                  comp_idx);
                        lod_collect_avoidance(lo, lag, comp_idx);
 
-                       rc = lod_alloc_qos(env, lo, stripe, ost_indices, flag,
-                                          th, comp_idx);
+                       rc = lod_ost_alloc_qos(env, lo, stripe, ost_indices,
+                                              flag, th, comp_idx);
                        if (rc == -EAGAIN)
-                               rc = lod_alloc_rr(env, lo, stripe, ost_indices,
-                                                 flag, th, comp_idx);
+                               rc = lod_ost_alloc_rr(env, lo, stripe,
+                                                     ost_indices, flag, th,
+                                                     comp_idx);
                } else {
-                       rc = lod_alloc_specific(env, lo, stripe, ost_indices,
-                                               flag, th, comp_idx);
+                       rc = lod_ost_alloc_specific(env, lo, stripe,
+                                                   ost_indices, flag, th,
+                                                   comp_idx);
                }
 put_ldts:
                lod_putref(d, &d->lod_ost_descs);
index cccc6aa..8297ae3 100644 (file)
 #ifdef CONFIG_PROC_FS
 
 /**
- * Show default stripe size.
- *
- * \param[in] m                seq file
- * \param[in] v                unused for single entry
- *
- * \retval 0           on success
- * \retval negative    error code if failed
+ * Show DoM default stripe size.
  */
-static int lod_dom_stripesize_seq_show(struct seq_file *m, void *v)
+static ssize_t dom_stripesize_show(struct kobject *kobj, struct attribute *attr,
+                                  char *buf)
 {
-       struct obd_device *dev = m->private;
-       struct lod_device *lod;
+       struct dt_device *dt = container_of(kobj, struct dt_device,
+                                           dd_kobj);
+       struct lod_device *lod = dt2lod_dev(dt);
 
-       LASSERT(dev != NULL);
-       lod = lu2lod_dev(dev->obd_lu_dev);
-       seq_printf(m, "%u\n", lod->lod_dom_max_stripesize);
-       return 0;
+       return snprintf(buf, PAGE_SIZE, "%u\n", lod->lod_dom_max_stripesize);
 }
 
 /**
- * Set default stripe size.
- *
- * \param[in] file     proc file
- * \param[in] buffer   string containing the maximum number of bytes stored in
- *                     each object before moving to the next object in the
- *                     layout (if any)
- * \param[in] count    @buffer length
- * \param[in] off      unused for single entry
- *
- * \retval @count      on success
- * \retval negative    error code if failed
+ * Set DoM default stripe size.
  */
-static ssize_t
-lod_dom_stripesize_seq_write(struct file *file, const char __user *buffer,
-                             size_t count, loff_t *off)
+static ssize_t dom_stripesize_store(struct kobject *kobj,
+                                   struct attribute *attr, const char *buffer,
+                                   size_t count)
 {
-       struct seq_file *m = file->private_data;
-       struct obd_device *dev = m->private;
-       struct lod_device *lod;
+       struct dt_device *dt = container_of(kobj, struct dt_device,
+                                           dd_kobj);
+       struct lod_device *lod = dt2lod_dev(dt);
+       char tbuf[22] = "";
        s64 val;
        int rc;
 
-       LASSERT(dev != NULL);
-       lod = lu2lod_dev(dev->obd_lu_dev);
-       rc = lprocfs_str_with_units_to_s64(buffer, count, &val, '1');
+       if (count > (sizeof(tbuf) - 1))
+               return -EINVAL;
+
+       memcpy(tbuf, buffer, count);
+
+       rc = lu_str_to_s64(tbuf, count, &val, '1');
        if (rc)
                return rc;
+
        if (val < 0)
                return -ERANGE;
 
        /* 1GB is the limit */
        if (val > (1ULL << 30))
                return -ERANGE;
-       else if (val > 0) {
+
+       if (val > 0) {
                if (val < LOV_MIN_STRIPE_SIZE) {
                        LCONSOLE_INFO("Increasing provided stripe size to "
                                      "a minimum value %u\n",
@@ -117,57 +106,39 @@ lod_dom_stripesize_seq_write(struct file *file, const char __user *buffer,
 
        return count;
 }
-LPROC_SEQ_FOPS(lod_dom_stripesize);
 
-/**
- * Show default stripe size.
- *
- * \param[in] m                seq file
- * \param[in] v                unused for single entry
- *
- * \retval 0           on success
- * \retval negative    error code if failed
- */
-static int lod_stripesize_seq_show(struct seq_file *m, void *v)
+LUSTRE_RW_ATTR(dom_stripesize);
+
+static ssize_t stripesize_show(struct kobject *kobj, struct attribute *attr,
+                              char *buf)
 {
-       struct obd_device *dev = m->private;
-       struct lod_device *lod;
+       struct dt_device *dt = container_of(kobj, struct dt_device,
+                                           dd_kobj);
+       struct lod_device *lod = dt2lod_dev(dt);
 
-       LASSERT(dev != NULL);
-       lod  = lu2lod_dev(dev->obd_lu_dev);
-       seq_printf(m, "%llu\n",
-                  lod->lod_ost_descs.ltd_lov_desc.ld_default_stripe_size);
-       return 0;
+       return snprintf(buf, PAGE_SIZE, "%llu\n",
+                       lod->lod_ost_descs.ltd_lov_desc.ld_default_stripe_size);
 }
 
-/**
- * Set default stripe size.
- *
- * \param[in] file     proc file
- * \param[in] buffer   string containing the maximum number of bytes stored in
- *                     each object before moving to the next object in the
- *                     layout (if any)
- * \param[in] count    @buffer length
- * \param[in] off      unused for single entry
- *
- * \retval @count      on success
- * \retval negative    error code if failed
- */
-static ssize_t
-lod_stripesize_seq_write(struct file *file, const char __user *buffer,
-                        size_t count, loff_t *off)
+static ssize_t stripesize_store(struct kobject *kobj, struct attribute *attr,
+                               const char *buffer, size_t count)
 {
-       struct seq_file *m = file->private_data;
-       struct obd_device *dev = m->private;
-       struct lod_device *lod;
+       struct dt_device *dt = container_of(kobj, struct dt_device,
+                                           dd_kobj);
+       struct lod_device *lod = dt2lod_dev(dt);
+       char tbuf[22] = "";
        s64 val;
        int rc;
 
-       LASSERT(dev != NULL);
-       lod  = lu2lod_dev(dev->obd_lu_dev);
-       rc = lprocfs_str_with_units_to_s64(buffer, count, &val, '1');
+       if (count > (sizeof(tbuf) - 1))
+               return -EINVAL;
+
+       memcpy(tbuf, buffer, count);
+
+       rc = lu_str_to_s64(tbuf, count, &val, '1');
        if (rc)
                return rc;
+
        if (val < 0)
                return -ERANGE;
 
@@ -176,16 +147,11 @@ lod_stripesize_seq_write(struct file *file, const char __user *buffer,
 
        return count;
 }
-LPROC_SEQ_FOPS(lod_stripesize);
+
+LUSTRE_RW_ATTR(stripesize);
 
 /**
  * Show default stripe offset.
- *
- * \param[in] m                seq file
- * \param[in] v                unused for single entry
- *
- * \retval 0           on success
- * \retval negative    error code if failed
  */
 static ssize_t stripeoffset_show(struct kobject *kobj, struct attribute *attr,
                                 char *buf)
@@ -194,7 +160,7 @@ static ssize_t stripeoffset_show(struct kobject *kobj, struct attribute *attr,
                                            dd_kobj);
        struct lod_device *lod = dt2lod_dev(dt);
 
-       return sprintf(buf, "%lld\n",
+       return snprintf(buf, PAGE_SIZE, "%lld\n",
                lod->lod_ost_descs.ltd_lov_desc.ld_default_stripe_offset);
 }
 
@@ -203,17 +169,10 @@ static ssize_t stripeoffset_show(struct kobject *kobj, struct attribute *attr,
  *
  * Usually contains -1 allowing Lustre to balance objects among OST
  * otherwise may cause severe OST imbalance.
- *
- * \param[in] file     proc file
- * \param[in] buffer   string describing starting OST index for new files
- * \param[in] count    @buffer length
- * \param[in] off      unused for single entry
- *
- * \retval @count      on success
- * \retval negative    error code if failed
  */
-static ssize_t stripeoffset_store(struct kobject *kobj, struct attribute *attr,
-                                 const char *buffer, size_t count)
+static ssize_t stripeoffset_store(struct kobject *kobj,
+                                   struct attribute *attr,
+                                   const char *buffer, size_t count)
 {
        struct dt_device *dt = container_of(kobj, struct dt_device,
                                            dd_kobj);
@@ -232,45 +191,47 @@ static ssize_t stripeoffset_store(struct kobject *kobj, struct attribute *attr,
 
        return count;
 }
+
 LUSTRE_RW_ATTR(stripeoffset);
 
 /**
  * Show default striping pattern (LOV_PATTERN_*).
- *
- * \param[in] m                seq file
- * \param[in] v                unused for single entry
- *
- * \retval 0           on success
- * \retval negative    error code if failed
  */
-static ssize_t stripetype_show(struct kobject *kobj, struct attribute *attr,
-                              char *buf)
+static ssize_t __stripetype_show(struct kobject *kobj, struct attribute *attr,
+                                char *buf, bool is_mdt)
 {
        struct dt_device *dt = container_of(kobj, struct dt_device,
                                            dd_kobj);
        struct lod_device *lod = dt2lod_dev(dt);
+       struct lu_tgt_descs *ltd = is_mdt ? &lod->lod_mdt_descs :
+                                           &lod->lod_ost_descs;
+
+       return snprintf(buf, PAGE_SIZE, "%u\n", ltd->ltd_lov_desc.ld_pattern);
+}
 
-       return sprintf(buf, "%u\n", lod->lod_ost_descs.ltd_lov_desc.ld_pattern);
+static ssize_t mdt_stripetype_show(struct kobject *kobj, struct attribute *attr,
+                                  char *buf)
+{
+       return __stripetype_show(kobj, attr, buf, true);
+}
+
+static ssize_t stripetype_show(struct kobject *kobj, struct attribute *attr,
+                              char *buf)
+{
+       return __stripetype_show(kobj, attr, buf, false);
 }
 
 /**
  * Set default striping pattern (a number, not a human-readable string).
- *
- * \param[in] file     proc file
- * \param[in] buffer   string containing the default striping pattern for new
- *                     files. This is an integer LOV_PATTERN_* value
- * \param[in] count    @buffer length
- * \param[in] off      unused for single entry
- *
- * \retval @count      on success
- * \retval negative    error code if failed
  */
-static ssize_t stripetype_store(struct kobject *kobj, struct attribute *attr,
-                               const char *buffer, size_t count)
+static ssize_t __stripetype_store(struct kobject *kobj, struct attribute *attr,
+                                 const char *buffer, size_t count, bool is_mdt)
 {
        struct dt_device *dt = container_of(kobj, struct dt_device,
                                            dd_kobj);
        struct lod_device *lod = dt2lod_dev(dt);
+       struct lu_tgt_descs *ltd = is_mdt ? &lod->lod_mdt_descs :
+                                           &lod->lod_ost_descs;
        u32 pattern;
        int rc;
 
@@ -278,52 +239,73 @@ static ssize_t stripetype_store(struct kobject *kobj, struct attribute *attr,
        if (rc)
                return rc;
 
-       lod_fix_desc_pattern(&pattern);
-       lod->lod_ost_descs.ltd_lov_desc.ld_pattern = pattern;
+       if (is_mdt)
+               lod_fix_lmv_desc_pattern(&pattern);
+       else
+               lod_fix_desc_pattern(&pattern);
+
+       ltd->ltd_lov_desc.ld_pattern = pattern;
 
        return count;
 }
+
+static ssize_t mdt_stripetype_store(struct kobject *kobj,
+                                   struct attribute *attr, const char *buffer,
+                                   size_t count)
+{
+       return __stripetype_store(kobj, attr, buffer, count, true);
+}
+
+static ssize_t stripetype_store(struct kobject *kobj,
+                                   struct attribute *attr, const char *buffer,
+                                   size_t count)
+{
+       return __stripetype_store(kobj, attr, buffer, count, false);
+}
+
+LUSTRE_RW_ATTR(mdt_stripetype);
 LUSTRE_RW_ATTR(stripetype);
 
 /**
  * Show default number of stripes.
- *
- * \param[in] m                seq file
- * \param[in] v                unused for single entry
- *
- * \retval 0           on success,
- * \retval negative    error code if failed
  */
-static ssize_t stripecount_show(struct kobject *kobj, struct attribute *attr,
-                               char *buf)
+static ssize_t __stripecount_show(struct kobject *kobj, struct attribute *attr,
+                                 char *buf, bool is_mdt)
 {
        struct dt_device *dt = container_of(kobj, struct dt_device,
                                            dd_kobj);
        struct lod_device *lod = dt2lod_dev(dt);
-       struct lov_desc *desc = &lod->lod_ost_descs.ltd_lov_desc;
+       struct lov_desc *desc = is_mdt ? &lod->lod_mdt_descs.ltd_lov_desc :
+                                        &lod->lod_ost_descs.ltd_lov_desc;
 
-       return sprintf(buf, "%d\n",
+       return snprintf(buf, PAGE_SIZE, "%d\n",
                      (s16)(desc->ld_default_stripe_count + 1) - 1);
 }
 
+static ssize_t mdt_stripecount_show(struct kobject *kobj,
+                                   struct attribute *attr, char *buf)
+{
+       return __stripecount_show(kobj, attr, buf, true);
+}
+
+static ssize_t stripecount_show(struct kobject *kobj, struct attribute *attr,
+                               char *buf)
+{
+       return __stripecount_show(kobj, attr, buf, false);
+}
+
 /**
  * Set default number of stripes.
- *
- * \param[in] file     proc file
- * \param[in] buffer   string containing the default number of stripes
- *                     for new files
- * \param[in] count    @buffer length
- * \param[in] off      unused for single entry
- *
- * \retval @count      on success
- * \retval negative    error code otherwise
  */
-static ssize_t stripecount_store(struct kobject *kobj, struct attribute *attr,
-                                 const char *buffer, size_t count)
+static ssize_t __stripecount_store(struct kobject *kobj, struct attribute *attr,
+                                  const char *buffer, size_t count,
+                                  bool is_mdt)
 {
        struct dt_device *dt = container_of(kobj, struct dt_device,
                                            dd_kobj);
        struct lod_device *lod = dt2lod_dev(dt);
+       struct lu_tgt_descs *ltd = is_mdt ? &lod->lod_mdt_descs :
+                                           &lod->lod_ost_descs;
        int stripe_count;
        int rc;
 
@@ -335,61 +317,91 @@ static ssize_t stripecount_store(struct kobject *kobj, struct attribute *attr,
                return -ERANGE;
 
        lod_fix_desc_stripe_count(&stripe_count);
-       lod->lod_ost_descs.ltd_lov_desc.ld_default_stripe_count = stripe_count;
+       ltd->ltd_lov_desc.ld_default_stripe_count = stripe_count;
 
        return count;
 }
+
+static ssize_t mdt_stripecount_store(struct kobject *kobj,
+                                    struct attribute *attr,
+                                    const char *buffer, size_t count)
+{
+       return __stripecount_store(kobj, attr, buffer, count, true);
+}
+
+static ssize_t stripecount_store(struct kobject *kobj,
+                                struct attribute *attr,
+                                const char *buffer, size_t count)
+{
+       return __stripecount_store(kobj, attr, buffer, count, false);
+}
+
+LUSTRE_RW_ATTR(mdt_stripecount);
 LUSTRE_RW_ATTR(stripecount);
 
 /**
  * Show number of targets.
- *
- * \param[in] m                seq file
- * \param[in] v                unused for single entry
- *
- * \retval 0           on success
- * \retval negative    error code if failed
  */
-static ssize_t numobd_show(struct kobject *kobj, struct attribute *attr,
-                          char *buf)
+static ssize_t __numobd_show(struct kobject *kobj, struct attribute *attr,
+                            char *buf, bool is_mdt)
 {
        struct dt_device *dt = container_of(kobj, struct dt_device,
                                            dd_kobj);
        struct lod_device *lod = dt2lod_dev(dt);
+       struct lu_tgt_descs *ltd = is_mdt ? &lod->lod_mdt_descs :
+                                           &lod->lod_ost_descs;
+
+       return snprintf(buf, PAGE_SIZE, "%u\n", ltd->ltd_lov_desc.ld_tgt_count);
+}
+
+static ssize_t mdt_numobd_show(struct kobject *kobj, struct attribute *attr,
+                              char *buf)
+{
+       return __numobd_show(kobj, attr, buf, true);
+}
 
-       return sprintf(buf, "%u\n", lod->lod_ost_count);
+static ssize_t numobd_show(struct kobject *kobj, struct attribute *attr,
+                          char *buf)
+{
+       return __numobd_show(kobj, attr, buf, false);
 }
+
+LUSTRE_RO_ATTR(mdt_numobd);
 LUSTRE_RO_ATTR(numobd);
 
 /**
  * Show number of active targets.
- *
- * \param[in] m                seq file
- * \param[in] v                unused for single entry
- *
- * \retval 0           on success
- * \retval negative    error code if failed
  */
-static ssize_t activeobd_show(struct kobject *kobj, struct attribute *attr,
-                             char *buf)
+static ssize_t __activeobd_show(struct kobject *kobj, struct attribute *attr,
+                               char *buf, bool is_mdt)
 {
        struct dt_device *dt = container_of(kobj, struct dt_device,
                                            dd_kobj);
        struct lod_device *lod = dt2lod_dev(dt);
+       struct lu_tgt_descs *ltd = is_mdt ? &lod->lod_mdt_descs :
+                                           &lod->lod_ost_descs;
+
+       return snprintf(buf, PAGE_SIZE, "%u\n",
+                       ltd->ltd_lov_desc.ld_active_tgt_count);
+}
 
-       return sprintf(buf, "%u\n",
-                      lod->lod_ost_descs.ltd_lov_desc.ld_active_tgt_count);
+static ssize_t mdt_activeobd_show(struct kobject *kobj, struct attribute *attr,
+                                 char *buf)
+{
+       return __activeobd_show(kobj, attr, buf, true);
 }
+
+static ssize_t activeobd_show(struct kobject *kobj, struct attribute *attr,
+                             char *buf)
+{
+       return __activeobd_show(kobj, attr, buf, false);
+}
+
+LUSTRE_RO_ATTR(mdt_activeobd);
 LUSTRE_RO_ATTR(activeobd);
 
 /**
  * Show UUID of LOD device.
- *
- * \param[in] m                seq file
- * \param[in] v                unused for single entry
- *
- * \retval 0           on success
- * \retval negative    error code if failed
  */
 static ssize_t desc_uuid_show(struct kobject *kobj, struct attribute *attr,
                              char *buf)
@@ -398,7 +410,7 @@ static ssize_t desc_uuid_show(struct kobject *kobj, struct attribute *attr,
                                            dd_kobj);
        struct lod_device *lod = dt2lod_dev(dt);
 
-       return sprintf(buf, "%s\n",
+       return snprintf(buf, PAGE_SIZE, "%s\n",
                       lod->lod_ost_descs.ltd_lov_desc.ld_uuid.uuid);
 }
 LUSTRE_RO_ATTR(desc_uuid);
@@ -410,23 +422,31 @@ LUSTRE_RO_ATTR(desc_uuid);
  * of free space compared to performance. 0% means select OSTs equally
  * regardless of their free space, 100% means select OSTs only by their free
  * space even if it results in very imbalanced load on the OSTs.
- *
- * \param[in] m                seq file
- * \param[in] v                unused for single entry
- *
- * \retval 0           on success
- * \retval negative    error code if failed
  */
-static ssize_t qos_prio_free_show(struct kobject *kobj, struct attribute *attr,
-                                 char *buf)
+static ssize_t __qos_prio_free_show(struct kobject *kobj,
+                                   struct attribute *attr, char *buf,
+                                   bool is_mdt)
 {
        struct dt_device *dt = container_of(kobj, struct dt_device,
                                            dd_kobj);
        struct lod_device *lod = dt2lod_dev(dt);
+       struct lu_tgt_descs *ltd = is_mdt ? &lod->lod_mdt_descs :
+                                           &lod->lod_ost_descs;
+
+       return snprintf(buf, PAGE_SIZE, "%d%%\n",
+                      (ltd->ltd_qos.lq_prio_free * 100 + 255) >> 8);
+}
 
-       return sprintf(buf, "%d%%\n",
-                      (lod->lod_ost_descs.ltd_qos.lq_prio_free * 100 + 255) >>
-                               8);
+static ssize_t mdt_qos_prio_free_show(struct kobject *kobj,
+                                     struct attribute *attr, char *buf)
+{
+       return __qos_prio_free_show(kobj, attr, buf, true);
+}
+
+static ssize_t qos_prio_free_show(struct kobject *kobj,
+                                 struct attribute *attr, char *buf)
+{
+       return __qos_prio_free_show(kobj, attr, buf, false);
 }
 
 /**
@@ -436,21 +456,17 @@ static ssize_t qos_prio_free_show(struct kobject *kobj, struct attribute *attr,
  * are space imbalanced.  See lod_qos_priofree_seq_show() for description of
  * this parameter.  See lod_qos_thresholdrr_seq_write() and lq_threshold_rr to
  * determine what constitutes "space imbalanced" OSTs.
- *
- * \param[in] file     proc file
- * \param[in] buffer   string which contains the free space priority (0-100)
- * \param[in] count    @buffer length
- * \param[in] off      unused for single entry
- *
- * \retval @count      on success
- * \retval negative    error code if failed
  */
-static ssize_t qos_prio_free_store(struct kobject *kobj, struct attribute *attr,
-                                  const char *buffer, size_t count)
+static ssize_t __qos_prio_free_store(struct kobject *kobj,
+                                    struct attribute *attr,
+                                    const char *buffer, size_t count,
+                                    bool is_mdt)
 {
        struct dt_device *dt = container_of(kobj, struct dt_device,
                                            dd_kobj);
        struct lod_device *lod = dt2lod_dev(dt);
+       struct lu_tgt_descs *ltd = is_mdt ? &lod->lod_mdt_descs :
+                                           &lod->lod_ost_descs;
        unsigned int val;
        int rc;
 
@@ -460,34 +476,56 @@ static ssize_t qos_prio_free_store(struct kobject *kobj, struct attribute *attr,
 
        if (val > 100)
                return -EINVAL;
-       lod->lod_ost_descs.ltd_qos.lq_prio_free = (val << 8) / 100;
-       lod->lod_ost_descs.ltd_qos.lq_dirty = 1;
-       lod->lod_ost_descs.ltd_qos.lq_reset = 1;
+       ltd->ltd_qos.lq_prio_free = (val << 8) / 100;
+       ltd->ltd_qos.lq_dirty = 1;
+       ltd->ltd_qos.lq_reset = 1;
 
        return count;
 }
+
+static ssize_t mdt_qos_prio_free_store(struct kobject *kobj,
+                                      struct attribute *attr,
+                                      const char *buffer, size_t count)
+{
+       return __qos_prio_free_store(kobj, attr, buffer, count, true);
+}
+
+static ssize_t qos_prio_free_store(struct kobject *kobj, struct attribute *attr,
+                                  const char *buffer, size_t count)
+{
+       return __qos_prio_free_store(kobj, attr, buffer, count, false);
+}
+
+LUSTRE_RW_ATTR(mdt_qos_prio_free);
 LUSTRE_RW_ATTR(qos_prio_free);
 
 /**
  * Show threshold for "same space on all OSTs" rule.
- *
- * \param[in] m                seq file
- * \param[in] v                unused for single entry
- *
- * \retval 0           on success
- * \retval negative    error code if failed
  */
-static int lod_qos_thresholdrr_seq_show(struct seq_file *m, void *v)
+static ssize_t __qos_thresholdrr_show(struct kobject *kobj,
+                                   struct attribute *attr, char *buf,
+                                   bool is_mdt)
 {
-       struct obd_device *dev = m->private;
-       struct lod_device *lod;
+       struct dt_device *dt = container_of(kobj, struct dt_device,
+                                           dd_kobj);
+       struct lod_device *lod = dt2lod_dev(dt);
+       struct lu_tgt_descs *ltd = is_mdt ? &lod->lod_mdt_descs :
+                                           &lod->lod_ost_descs;
 
-       LASSERT(dev != NULL);
-       lod = lu2lod_dev(dev->obd_lu_dev);
-       seq_printf(m, "%d%%\n",
-                  (lod->lod_ost_descs.ltd_qos.lq_threshold_rr * 100 + 255) >>
-                       8);
-       return 0;
+       return snprintf(buf, PAGE_SIZE, "%d%%\n",
+                      (ltd->ltd_qos.lq_threshold_rr * 100 + 255) >> 8);
+}
+
+static ssize_t mdt_qos_thresholdrr_show(struct kobject *kobj,
+                                       struct attribute *attr, char *buf)
+{
+       return __qos_thresholdrr_show(kobj, attr, buf, true);
+}
+
+static ssize_t lod_qos_thresholdrr_show(struct kobject *kobj,
+                                       struct attribute *attr, char *buf)
+{
+       return __qos_thresholdrr_show(kobj, attr, buf, false);
 }
 
 /**
@@ -498,80 +536,89 @@ static int lod_qos_thresholdrr_seq_show(struct seq_file *m, void *v)
  * is exceeded, use the QoS allocator to select OSTs based on their available
  * space so that more full OSTs are chosen less often, otherwise use the
  * round-robin allocator for efficiency and performance.
-
- * \param[in] file     proc file
- * \param[in] buffer   string containing percentage difference of free space
- * \param[in] count    @buffer length
- * \param[in] off      unused for single entry
- *
- * \retval @count      on success
- * \retval negative    error code if failed
  */
-static ssize_t
-lod_qos_thresholdrr_seq_write(struct file *file, const char __user *buffer,
-                             size_t count, loff_t *off)
+static ssize_t __qos_thresholdrr_store(struct kobject *kobj,
+                                      struct attribute *attr,
+                                      const char *buffer, size_t count,
+                                      bool is_mdt)
 {
-       struct seq_file *m = file->private_data;
-       struct obd_device *dev = m->private;
-       struct lod_device *lod;
+       struct dt_device *dt = container_of(kobj, struct dt_device,
+                                           dd_kobj);
+       struct lod_device *lod = dt2lod_dev(dt);
+       struct lu_tgt_descs *ltd = is_mdt ? &lod->lod_mdt_descs :
+                                           &lod->lod_ost_descs;
+       unsigned int val;
        int rc;
-       __s64 val;
 
-       LASSERT(dev != NULL);
-       lod = lu2lod_dev(dev->obd_lu_dev);
-
-       rc = lprocfs_str_with_units_to_s64(buffer, count, &val, '%');
+       rc = kstrtouint(buffer, 0, &val);
        if (rc)
                return rc;
 
-       if (val > 100 || val < 0)
+       if (val > 100)
                return -EINVAL;
-
-       lod->lod_ost_descs.ltd_qos.lq_threshold_rr = (val << 8) / 100;
-       lod->lod_ost_descs.ltd_qos.lq_dirty = 1;
+       ltd->ltd_qos.lq_threshold_rr = (val << 8) / 100;
+       ltd->ltd_qos.lq_dirty = 1;
 
        return count;
 }
-LPROC_SEQ_FOPS(lod_qos_thresholdrr);
+
+static ssize_t mdt_qos_thresholdrr_store(struct kobject *kobj,
+                                        struct attribute *attr,
+                                        const char *buffer, size_t count)
+{
+       return __qos_thresholdrr_store(kobj, attr, buffer, count, true);
+}
+
+static ssize_t lod_qos_thresholdrr_store(struct kobject *kobj,
+                                        struct attribute *attr,
+                                        const char *buffer, size_t count)
+{
+       return __qos_thresholdrr_store(kobj, attr, buffer, count, false);
+}
+
+LUSTRE_RW_ATTR(mdt_qos_thresholdrr);
+LUSTRE_RW_ATTR(lod_qos_thresholdrr);
 
 /**
  * Show expiration period used to refresh cached statfs data, which
  * is used to implement QoS/RR striping allocation algorithm.
- *
- * \param[in] m                seq file
- * \param[in] v                unused for single entry
- *
- * \retval 0           on success
- * \retval negative    error code if failed
  */
-static ssize_t qos_maxage_show(struct kobject *kobj, struct attribute *attr,
-                              char *buf)
+static ssize_t __qos_maxage_show(struct kobject *kobj, struct attribute *attr,
+                                char *buf, bool is_mdt)
 {
        struct dt_device *dt = container_of(kobj, struct dt_device,
                                            dd_kobj);
        struct lod_device *lod = dt2lod_dev(dt);
+       struct lu_tgt_descs *ltd = is_mdt ? &lod->lod_mdt_descs :
+                                           &lod->lod_ost_descs;
 
-       return sprintf(buf, "%u Sec\n",
-                      lod->lod_ost_descs.ltd_lov_desc.ld_qos_maxage);
+       return snprintf(buf, PAGE_SIZE, "%u Sec\n",
+                      ltd->ltd_lov_desc.ld_qos_maxage);
+}
+
+static ssize_t mdt_qos_maxage_show(struct kobject *kobj, struct attribute *attr,
+                                  char *buf)
+{
+       return __qos_maxage_show(kobj, attr, buf, true);
+}
+
+/*
+ * Show the statfs cache expiration period of the OST target descriptors.
+ * Must pass is_mdt = false; the MDT value is exposed separately through
+ * mdt_qos_maxage_show().
+ */
+static ssize_t qos_maxage_show(struct kobject *kobj, struct attribute *attr,
+                              char *buf)
+{
+       return __qos_maxage_show(kobj, attr, buf, false);
 }
 
 /**
  * Set expiration period used to refresh cached statfs data.
- *
- * \param[in] file     proc file
- * \param[in] buffer   string contains maximum age of statfs data in seconds
- * \param[in] count    @buffer length
- * \param[in] off      unused for single entry
- *
- * \retval @count      on success
- * \retval negative    error code if failed
  */
-static ssize_t qos_maxage_store(struct kobject *kobj, struct attribute *attr,
-                               const char *buffer, size_t count)
+static ssize_t __qos_maxage_store(struct kobject *kobj, struct attribute *attr,
+                                 const char *buffer, size_t count, bool is_mdt)
 {
        struct dt_device *dt = container_of(kobj, struct dt_device,
                                            dd_kobj);
        struct lod_device *lod = dt2lod_dev(dt);
+       struct lu_tgt_descs *ltd = is_mdt ? &lod->lod_mdt_descs :
+                                           &lod->lod_ost_descs;
        struct lustre_cfg_bufs bufs;
        struct lu_device *next;
        struct lustre_cfg *lcfg;
@@ -586,7 +633,8 @@ static ssize_t qos_maxage_store(struct kobject *kobj, struct attribute *attr,
 
        if (val <= 0)
                return -EINVAL;
-       lod->lod_ost_descs.ltd_lov_desc.ld_qos_maxage = val;
+
+       ltd->ltd_lov_desc.ld_qos_maxage = val;
 
        /*
         * propogate the value down to OSPs
@@ -599,67 +647,117 @@ static ssize_t qos_maxage_store(struct kobject *kobj, struct attribute *attr,
                return -ENOMEM;
        lustre_cfg_init(lcfg, LCFG_PARAM, &bufs);
 
-       lod_getref(&lod->lod_ost_descs);
-       lod_foreach_ost(lod, tgt) {
+       lod_getref(ltd);
+       ltd_foreach_tgt(ltd, tgt) {
                next = &tgt->ltd_tgt->dd_lu_dev;
                rc = next->ld_ops->ldo_process_config(NULL, next, lcfg);
                if (rc)
                        CERROR("can't set maxage on #%d: %d\n",
                               tgt->ltd_index, rc);
        }
-       lod_putref(lod, &lod->lod_ost_descs);
+       lod_putref(lod, ltd);
        OBD_FREE(lcfg, lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens));
 
        return count;
 }
+
+static ssize_t mdt_qos_maxage_store(struct kobject *kobj,
+                                   struct attribute *attr,
+                                   const char *buffer, size_t count)
+{
+       return __qos_maxage_store(kobj, attr, buffer, count, true);
+}
+
+static ssize_t qos_maxage_store(struct kobject *kobj, struct attribute *attr,
+                               const char *buffer, size_t count)
+{
+       return __qos_maxage_store(kobj, attr, buffer, count, false);
+}
+
+LUSTRE_RW_ATTR(mdt_qos_maxage);
 LUSTRE_RW_ATTR(qos_maxage);
 
-static void *lod_osts_seq_start(struct seq_file *p, loff_t *pos)
+static void *lod_tgts_seq_start(struct seq_file *p, loff_t *pos, bool is_mdt)
 {
        struct obd_device *dev = p->private;
-       struct lod_device *lod;
+       struct lod_device *lod = lu2lod_dev(dev->obd_lu_dev);
+       struct lu_tgt_descs *ltd = is_mdt ? &lod->lod_mdt_descs :
+                                           &lod->lod_ost_descs;
 
        LASSERT(dev != NULL);
-       lod = lu2lod_dev(dev->obd_lu_dev);
 
-       lod_getref(&lod->lod_ost_descs); /* released in lod_osts_seq_stop */
-       if (*pos >= lod->lod_ost_bitmap->size)
+       lod_getref(ltd); /* released in lod_tgts_seq_stop */
+       if (*pos >= ltd->ltd_tgt_bitmap->size)
                return NULL;
 
-       *pos = find_next_bit(lod->lod_ost_bitmap->data,
-                                lod->lod_ost_bitmap->size, *pos);
-       if (*pos < lod->lod_ost_bitmap->size)
-               return OST_TGT(lod,*pos);
+       *pos = find_next_bit(ltd->ltd_tgt_bitmap->data,
+                            ltd->ltd_tgt_bitmap->size, *pos);
+       if (*pos < ltd->ltd_tgt_bitmap->size)
+               return LTD_TGT(ltd, *pos);
        else
                return NULL;
 }
 
-static void lod_osts_seq_stop(struct seq_file *p, void *v)
+static void *lod_mdts_seq_start(struct seq_file *p, loff_t *pos)
+{
+       return lod_tgts_seq_start(p, pos, true);
+}
+
+static void *lod_osts_seq_start(struct seq_file *p, loff_t *pos)
+{
+       return lod_tgts_seq_start(p, pos, false);
+}
+
+static void lod_tgts_seq_stop(struct seq_file *p, void *v, bool is_mdt)
 {
        struct obd_device *dev = p->private;
-       struct lod_device *lod;
+       struct lod_device *lod = lu2lod_dev(dev->obd_lu_dev);
+       struct lu_tgt_descs *ltd = is_mdt ? &lod->lod_mdt_descs :
+                                           &lod->lod_ost_descs;
 
        LASSERT(dev != NULL);
-       lod = lu2lod_dev(dev->obd_lu_dev);
-       lod_putref(lod, &lod->lod_ost_descs);
+       lod_putref(lod, ltd);
 }
 
-static void *lod_osts_seq_next(struct seq_file *p, void *v, loff_t *pos)
+static void lod_mdts_seq_stop(struct seq_file *p, void *v)
+{
+       lod_tgts_seq_stop(p, v, true);
+}
+
+static void lod_osts_seq_stop(struct seq_file *p, void *v)
+{
+       lod_tgts_seq_stop(p, v, false);
+}
+
+static void *lod_tgts_seq_next(struct seq_file *p, void *v, loff_t *pos,
+                              bool is_mdt)
 {
        struct obd_device *dev = p->private;
        struct lod_device *lod = lu2lod_dev(dev->obd_lu_dev);
+       struct lu_tgt_descs *ltd = is_mdt ? &lod->lod_mdt_descs :
+                                           &lod->lod_ost_descs;
 
-       if (*pos >= lod->lod_ost_bitmap->size - 1)
+       if (*pos >= ltd->ltd_tgt_bitmap->size - 1)
                return NULL;
 
-       *pos = find_next_bit(lod->lod_ost_bitmap->data,
-                                lod->lod_ost_bitmap->size, *pos + 1);
-       if (*pos < lod->lod_ost_bitmap->size)
-               return OST_TGT(lod,*pos);
+       *pos = find_next_bit(ltd->ltd_tgt_bitmap->data,
+                            ltd->ltd_tgt_bitmap->size, *pos + 1);
+       if (*pos < ltd->ltd_tgt_bitmap->size)
+               return LTD_TGT(ltd, *pos);
        else
                return NULL;
 }
 
+static void *lod_mdts_seq_next(struct seq_file *p, void *v, loff_t *pos)
+{
+       return lod_tgts_seq_next(p, v, pos, true);
+}
+
+static void *lod_osts_seq_next(struct seq_file *p, void *v, loff_t *pos)
+{
+       return lod_tgts_seq_next(p, v, pos, false);
+}
+
 /**
  * Show active/inactive status for OST found by lod_osts_seq_next().
  *
@@ -669,45 +767,62 @@ static void *lod_osts_seq_next(struct seq_file *p, void *v, loff_t *pos)
  * \retval 0           on success
  * \retval negative    error code if failed
  */
-static int lod_osts_seq_show(struct seq_file *p, void *v)
+static int lod_tgts_seq_show(struct seq_file *p, void *v)
 {
-       struct obd_device   *obd = p->private;
-       struct lu_tgt_desc *ost_desc = v;
-       struct lod_device   *lod;
-       int                  idx, rc, active;
-       struct dt_device    *next;
-       struct obd_statfs    sfs;
+       struct obd_device *obd = p->private;
+       struct lu_tgt_desc *tgt = v;
+       struct dt_device *next;
+       int rc, active;
 
        LASSERT(obd->obd_lu_dev);
-       lod = lu2lod_dev(obd->obd_lu_dev);
 
-       idx = ost_desc->ltd_index;
-       next = OST_TGT(lod, idx)->ltd_tgt;
-       if (next == NULL)
+       next = tgt->ltd_tgt;
+       if (!next)
                return -EINVAL;
 
        /* XXX: should be non-NULL env, but it's very expensive */
        active = 1;
-       rc = dt_statfs(NULL, next, &sfs);
+       rc = dt_statfs(NULL, next, &tgt->ltd_statfs);
        if (rc == -ENOTCONN) {
                active = 0;
                rc = 0;
        } else if (rc)
                return rc;
 
-       seq_printf(p, "%d: %s %sACTIVE\n", idx,
-                  obd_uuid2str(&ost_desc->ltd_uuid),
+       seq_printf(p, "%d: %s %sACTIVE\n", tgt->ltd_index,
+                  obd_uuid2str(&tgt->ltd_uuid),
                   active ? "" : "IN");
        return 0;
 }
 
+static const struct seq_operations lod_mdts_sops = {
+       .start  = lod_mdts_seq_start,
+       .stop   = lod_mdts_seq_stop,
+       .next   = lod_mdts_seq_next,
+       .show   = lod_tgts_seq_show,
+};
+
 static const struct seq_operations lod_osts_sops = {
        .start  = lod_osts_seq_start,
        .stop   = lod_osts_seq_stop,
        .next   = lod_osts_seq_next,
-       .show   = lod_osts_seq_show,
+       .show   = lod_tgts_seq_show,
 };
 
+static int lod_mdts_seq_open(struct inode *inode, struct file *file)
+{
+       struct seq_file *seq;
+       int rc;
+
+       rc = seq_open(file, &lod_mdts_sops);
+       if (rc)
+               return rc;
+
+       seq = file->private_data;
+       seq->private = PDE_DATA(inode);
+       return 0;
+}
+
 static int lod_osts_seq_open(struct inode *inode, struct file *file)
 {
        struct seq_file *seq;
@@ -724,12 +839,6 @@ static int lod_osts_seq_open(struct inode *inode, struct file *file)
 
 /**
  * Show whether special failout mode for testing is enabled or not.
- *
- * \param[in] m                seq file
- * \param[in] v                unused for single entry
- *
- * \retval 0           on success
- * \retval negative    error code if failed
  */
 static ssize_t lmv_failout_show(struct kobject *kobj, struct attribute *attr,
                                char *buf)
@@ -738,7 +847,7 @@ static ssize_t lmv_failout_show(struct kobject *kobj, struct attribute *attr,
                                            dd_kobj);
        struct lod_device *lod = dt2lod_dev(dt);
 
-       return sprintf(buf, "%d\n", lod->lod_lmv_failout ? 1 : 0);
+       return snprintf(buf, PAGE_SIZE, "%d\n", lod->lod_lmv_failout ? 1 : 0);
 }
 
 /**
@@ -747,14 +856,6 @@ static ssize_t lmv_failout_show(struct kobject *kobj, struct attribute *attr,
  * This determines whether the LMV will try to continue processing a striped
  * directory even if it has a (partly) corrupted entry in the master directory,
  * or if it will abort upon finding a corrupted slave directory entry.
- *
- * \param[in] file     proc file
- * \param[in] buffer   string: 0 or non-zero to disable or enable LMV failout
- * \param[in] count    @buffer length
- * \param[in] off      unused for single entry
- *
- * \retval @count      on success
- * \retval negative    error code if failed
  */
 static ssize_t lmv_failout_store(struct kobject *kobj, struct attribute *attr,
                                 const char *buffer, size_t count)
@@ -776,15 +877,17 @@ static ssize_t lmv_failout_store(struct kobject *kobj, struct attribute *attr,
 LUSTRE_RW_ATTR(lmv_failout);
 
 static struct lprocfs_vars lprocfs_lod_obd_vars[] = {
-       { .name =       "stripesize",
-         .fops =       &lod_stripesize_fops    },
-       { .name =       "qos_threshold_rr",
-         .fops =       &lod_qos_thresholdrr_fops },
-       { .name =       "dom_stripesize",
-         .fops =       &lod_dom_stripesize_fops        },
        { NULL }
 };
 
+static const struct file_operations lod_proc_mdt_fops = {
+       .owner   = THIS_MODULE,
+       .open    = lod_mdts_seq_open,
+       .read    = seq_read,
+       .llseek  = seq_lseek,
+       .release = lprocfs_seq_release,
+};
+
 static const struct file_operations lod_proc_target_fops = {
        .owner   = THIS_MODULE,
        .open    = lod_osts_seq_open,
@@ -794,6 +897,8 @@ static const struct file_operations lod_proc_target_fops = {
 };
 
 static struct attribute *lod_attrs[] = {
+       &lustre_attr_dom_stripesize.attr,
+       &lustre_attr_stripesize.attr,
        &lustre_attr_stripeoffset.attr,
        &lustre_attr_stripecount.attr,
        &lustre_attr_stripetype.attr,
@@ -803,6 +908,14 @@ static struct attribute *lod_attrs[] = {
        &lustre_attr_numobd.attr,
        &lustre_attr_qos_maxage.attr,
        &lustre_attr_qos_prio_free.attr,
+       &lustre_attr_lod_qos_thresholdrr.attr,
+       &lustre_attr_mdt_stripecount.attr,
+       &lustre_attr_mdt_stripetype.attr,
+       &lustre_attr_mdt_activeobd.attr,
+       &lustre_attr_mdt_numobd.attr,
+       &lustre_attr_mdt_qos_maxage.attr,
+       &lustre_attr_mdt_qos_prio_free.attr,
+       &lustre_attr_mdt_qos_thresholdrr.attr,
        NULL,
 };
 
@@ -842,6 +955,14 @@ int lod_procfs_init(struct lod_device *lod)
                GOTO(out, rc);
        }
 
+       rc = lprocfs_seq_create(obd->obd_proc_entry, "mdt_obd",
+                               0444, &lod_proc_mdt_fops, obd);
+       if (rc) {
+               CWARN("%s: Error adding the mdt_obd file %d\n",
+                     obd->obd_name, rc);
+               GOTO(out, rc);
+       }
+
        rc = lprocfs_seq_create(obd->obd_proc_entry, "target_obd",
                                0444, &lod_proc_target_fops, obd);
        if (rc) {
@@ -932,4 +1053,3 @@ void lod_procfs_fini(struct lod_device *lod)
 }
 
 #endif /* CONFIG_PROC_FS */
-
index 73c710e..81c148f 100644 (file)
@@ -768,19 +768,6 @@ static int mdt_reint_setattr(struct mdt_thread_info *info,
 
                        buf->lb_buf = lmu;
                        buf->lb_len = ma->ma_lmv_size;
-
-                       if (le32_to_cpu(lmu->lum_hash_type) &
-                           LMV_HASH_FLAG_SPACE) {
-                               /*
-                                * only allow setting "space" hash flag on
-                                * plain directory.
-                                */
-                               rc = mdt_object_striped(info, mo);
-                               if (rc)
-                                       GOTO(out_put,
-                                            rc = (rc == 1) ? -EPERM : rc);
-                       }
-
                        name = XATTR_NAME_DEFAULT_LMV;
                        /* force client to update dir default layout */
                        lockpart |= MDS_INODELOCK_LOOKUP;
index 11fdc78..7030e8e 100644 (file)
@@ -1989,34 +1989,31 @@ static int str_to_u64_parse(char *buffer, unsigned long count,
  * have a unit as the last character. The function handles overflow/underflow
  * of the signed integer.
  */
-static int str_to_s64_internal(const char __user *buffer, unsigned long count,
-                              __s64 *val, __u64 def_mult, bool allow_units)
+int lu_str_to_s64(char *buffer, unsigned long count, __s64 *val, char defunit)
 {
-       char kernbuf[22];
+       __u64 mult = 1;
        __u64 tmp;
        unsigned int offset = 0;
        int signed sign = 1;
        __u64 max = LLONG_MAX;
        int rc = 0;
 
-       if (count > (sizeof(kernbuf) - 1))
-               return -EINVAL;
-
-       if (copy_from_user(kernbuf, buffer, count))
-               return -EFAULT;
-
-       kernbuf[count] = '\0';
+       if (defunit != '1') {
+               rc = get_mult(defunit, &mult);
+               if (rc)
+                       return rc;
+       }
 
        /* keep track of our sign */
-       if (*kernbuf == '-') {
+       if (*buffer == '-') {
                sign = -1;
                offset++;
                /* equivalent to max = -LLONG_MIN, avoids overflow */
                max++;
        }
 
-       rc = str_to_u64_parse(kernbuf + offset, count - offset,
-                             &tmp, def_mult, allow_units);
+       rc = str_to_u64_parse(buffer + offset, count - offset,
+                             &tmp, mult, true);
        if (rc)
                return rc;
 
@@ -2028,6 +2025,7 @@ static int str_to_s64_internal(const char __user *buffer, unsigned long count,
 
        return 0;
 }
+EXPORT_SYMBOL(lu_str_to_s64);
 
 /* identical to s64 version, but does not handle overflow */
 static int str_to_u64_internal(const char __user *buffer, unsigned long count,
@@ -2072,16 +2070,17 @@ static int str_to_u64_internal(const char __user *buffer, unsigned long count,
 int lprocfs_str_with_units_to_s64(const char __user *buffer,
                                  unsigned long count, __s64 *val, char defunit)
 {
-       __u64 mult = 1;
-       int rc;
+       char kernbuf[22];
 
-       if (defunit != '1') {
-               rc = get_mult(defunit, &mult);
-               if (rc)
-                       return rc;
-       }
+       if (count > (sizeof(kernbuf) - 1))
+               return -EINVAL;
+
+       if (copy_from_user(kernbuf, buffer, count))
+               return -EFAULT;
+
+       kernbuf[count] = '\0';
 
-       return str_to_s64_internal(buffer, count, val, mult, true);
+       return lu_str_to_s64(kernbuf, count, val, defunit);
 }
 EXPORT_SYMBOL(lprocfs_str_with_units_to_s64);
 
index c7d8bbe..5ffe4ea 100644 (file)
@@ -110,10 +110,6 @@ int lu_qos_add_tgt(struct lu_qos *qos, struct lu_tgt_desc *tgt)
 
        ENTRY;
 
-       /* tgt not connected, this function will be called again later */
-       if (!exp)
-               RETURN(0);
-
        down_write(&qos->lq_rw_sem);
        /*
         * a bit hacky approach to learn NID of corresponding connection
@@ -531,7 +527,7 @@ int ltd_qos_penalties_calc(struct lu_tgt_descs *ltd)
                 * per-tgt penalty is
                 * prio * bavail * iavail / (num_tgt - 1) / 2
                 */
-               tgt->ltd_qos.ltq_penalty_per_obj = prio_wide * ba * ia;
+               tgt->ltd_qos.ltq_penalty_per_obj = prio_wide * ba * ia >> 8;
                do_div(tgt->ltd_qos.ltq_penalty_per_obj, num_active);
                tgt->ltd_qos.ltq_penalty_per_obj >>= 1;
 
@@ -565,8 +561,9 @@ int ltd_qos_penalties_calc(struct lu_tgt_descs *ltd)
        list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) {
                ba = svr->lsq_bavail;
                ia = svr->lsq_iavail;
-               svr->lsq_penalty_per_obj = prio_wide * ba  * ia;
-               do_div(ba, svr->lsq_tgt_count * num_active);
+               svr->lsq_penalty_per_obj = prio_wide * ba  * ia >> 8;
+               do_div(svr->lsq_penalty_per_obj,
+                      svr->lsq_tgt_count * num_active);
                svr->lsq_penalty_per_obj >>= 1;
 
                age = (now - svr->lsq_used) >> 3;
@@ -661,6 +658,7 @@ int ltd_qos_update(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt,
                if (!tgt->ltd_active)
                        continue;
 
+               ltq = &tgt->ltd_qos;
                if (ltq->ltq_penalty < ltq->ltq_penalty_per_obj)
                        ltq->ltq_penalty = 0;
                else
@@ -672,9 +670,10 @@ int ltd_qos_update(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt,
                if (ltq->ltq_usable)
                        *total_wt += ltq->ltq_weight;
 
-               CDEBUG(D_OTHER, "recalc tgt %d usable=%d avail=%llu tgtppo=%llu tgtp=%llu svrppo=%llu svrp=%llu wt=%llu\n",
+               CDEBUG(D_OTHER, "recalc tgt %d usable=%d bavail=%llu ffree=%llu tgtppo=%llu tgtp=%llu svrppo=%llu svrp=%llu wt=%llu\n",
                          tgt->ltd_index, ltq->ltq_usable,
-                         tgt_statfs_bavail(tgt) >> 10,
+                         tgt_statfs_bavail(tgt) >> 16,
+                         tgt_statfs_iavail(tgt) >> 8,
                          ltq->ltq_penalty_per_obj >> 10,
                          ltq->ltq_penalty >> 10,
                          ltq->ltq_svr->lsq_penalty_per_obj >> 10,
index b6fa9ae..3056a0b 100644 (file)
@@ -1861,7 +1861,6 @@ void lustre_assert_wire_constants(void)
        CLASSERT(LMV_MAGIC_V1 == 0x0CD20CD0);
        CLASSERT(LMV_MAGIC_STRIPE == 0x0CD40CD0);
        CLASSERT(LMV_HASH_TYPE_MASK == 0x0000ffff);
-       CLASSERT(LMV_HASH_FLAG_SPACE == 0x08000000);
        CLASSERT(LMV_HASH_FLAG_LOST_LMV == 0x10000000);
        CLASSERT(LMV_HASH_FLAG_BAD_TYPE == 0x20000000);
        CLASSERT(LMV_HASH_FLAG_MIGRATION == 0x80000000);
index 0484594..62ff94f 100644 (file)
@@ -20750,87 +20750,86 @@ test_412() {
 }
 run_test 412 "mkdir on specific MDTs"
 
-test_413a() {
-       [ $MDSCOUNT -lt 2 ] &&
-               skip "We need at least 2 MDTs for this test"
-
-       if [ $(lustre_version_code mds1) -lt $(version_code 2.10.55) ]; then
-               skip "Need server version at least 2.10.55"
-       fi
-
-       mkdir $DIR/$tdir || error "mkdir failed"
-
-       # find MDT that is the most full
-       local max=$($LFS df | grep MDT |
-               awk 'BEGIN { a=0 }
-                       { sub("%", "", $5)
-                         if (0+$5 >= a)
-                         {
-                               a = $5
-                               b = $6
-                         }
-                       }
-                    END { split(b, c, ":")
-                          sub("]", "", c[2])
-                          print c[2]
-                        }')
-
-       for i in $(seq $((MDSCOUNT - 1))); do
-               $LFS mkdir -c $i $DIR/$tdir/d$i ||
-                       error "mkdir d$i failed"
-               $LFS getdirstripe $DIR/$tdir/d$i
-               local stripe_index=$($LFS getdirstripe -i $DIR/$tdir/d$i)
-               [ $stripe_index -ne $max ] ||
-                       error "don't expect $max"
-       done
-}
-run_test 413a "mkdir on less full MDTs"
-
-test_413b() {
-       [ $MDSCOUNT -lt 2 ] &&
-               skip "We need at least 2 MDTs for this test"
-
-       [ $MDS1_VERSION -lt $(version_code 2.12.52) ] &&
-               skip "Need server version at least 2.12.52"
-
-       mkdir $DIR/$tdir || error "mkdir failed"
-       $LFS setdirstripe -D -i -1 -H space $DIR/$tdir ||
-               error "setdirstripe failed"
+test_qos_mkdir() {
+       local mkdir_cmd=$1
+       local stripe_count=$2
+       local mdts=$(comma_list $(mdts_nodes))
 
-       local qos_prio_free
-       local qos_threshold_rr
+       local testdir
+       local lmv_qos_prio_free
+       local lmv_qos_threshold_rr
+       local lmv_qos_maxage
+       local lod_qos_prio_free
+       local lod_qos_threshold_rr
+       local lod_qos_maxage
        local count
+       local i
 
-       qos_prio_free=$($LCTL get_param -n lmv.*.qos_prio_free | head -n1)
-       qos_prio_free=${qos_prio_free%%%}
-       qos_threshold_rr=$($LCTL get_param -n lmv.*.qos_threshold_rr | head -n1)
-       qos_threshold_rr=${qos_threshold_rr%%%}
-       qos_maxage=$($LCTL get_param -n lmv.*.qos_maxage)
-
-       stack_trap "$LCTL set_param lmv.*.qos_prio_free=$qos_prio_free" EXIT
-       stack_trap "$LCTL set_param lmv.*.qos_threshold_rr=$qos_threshold_rr" \
+       lmv_qos_prio_free=$($LCTL get_param -n lmv.*.qos_prio_free | head -n1)
+       lmv_qos_prio_free=${lmv_qos_prio_free%%%}
+       lmv_qos_threshold_rr=$($LCTL get_param -n lmv.*.qos_threshold_rr |
+               head -n1)
+       lmv_qos_threshold_rr=${lmv_qos_threshold_rr%%%}
+       lmv_qos_maxage=$($LCTL get_param -n lmv.*.qos_maxage)
+       stack_trap "$LCTL set_param \
+               lmv.*.qos_prio_free=$lmv_qos_prio_free > /dev/null" EXIT
+       stack_trap "$LCTL set_param \
+               lmv.*.qos_threshold_rr=$lmv_qos_threshold_rr > /dev/null" EXIT
+       stack_trap "$LCTL set_param \
+               lmv.*.qos_maxage=$lmv_qos_maxage > /dev/null" EXIT
+
+       lod_qos_prio_free=$(do_facet mds1 $LCTL get_param -n \
+               lod.lustre-MDT0000-mdtlov.mdt_qos_prio_free | head -n1)
+       lod_qos_prio_free=${lod_qos_prio_free%%%}
+       lod_qos_threshold_rr=$(do_facet mds1 $LCTL get_param -n \
+               lod.lustre-MDT0000-mdtlov.mdt_qos_thresholdrr | head -n1)
+       lod_qos_threshold_rr=${lod_qos_threshold_rr%%%}
+       lod_qos_maxage=$(do_facet mds1 $LCTL get_param -n \
+               lod.lustre-MDT0000-mdtlov.mdt_qos_maxage | awk '{ print $1 }')
+       stack_trap "do_nodes $mdts $LCTL set_param \
+               lod.*.mdt_qos_prio_free=$lod_qos_prio_free > /dev/null" EXIT
+       stack_trap "do_nodes $mdts $LCTL set_param \
+               lod.*.mdt_qos_thresholdrr=$lod_qos_threshold_rr > /dev/null" \
                EXIT
-       stack_trap "$LCTL set_param lmv.*.qos_maxage=$qos_maxage" EXIT
+       stack_trap "do_nodes $mdts $LCTL set_param \
+               lod.*.mdt_qos_maxage=$lod_qos_maxage > /dev/null" EXIT
+
+       echo
+       echo "Mkdir (stripe_count $stripe_count) roundrobin:"
 
-       echo "mkdir with roundrobin"
+       $LCTL set_param lmv.*.qos_threshold_rr=100 > /dev/null
+       do_nodes $mdts $LCTL set_param lod.*.mdt_qos_thresholdrr=100 > /dev/null
+
+       testdir=$DIR/$tdir-s$stripe_count/rr
 
-       $LCTL set_param lmv.*.qos_threshold_rr=100
        for i in $(seq $((100 * MDSCOUNT))); do
-               mkdir $DIR/$tdir/subdir$i || error "mkdir subdir$i failed"
+               eval $mkdir_cmd $testdir/subdir$i ||
+                       error "$mkdir_cmd subdir$i failed"
        done
+
        for i in $(seq $MDSCOUNT); do
-               count=$($LFS getdirstripe -i $DIR/$tdir/* | grep ^$((i - 1))$ |
-                       wc -w)
+               count=$($LFS getdirstripe -i $testdir/* |
+                               grep ^$((i - 1))$ | wc -l)
                echo "$count directories created on MDT$((i - 1))"
                [ $count -eq 100 ] || error "subdirs are not evenly distributed"
+
+               if [ $stripe_count -gt 1 ]; then
+                       count=$($LFS getdirstripe $testdir/* |
+                               grep -P "^\s+$((i - 1))\t" | wc -l)
+                       echo "$count stripes created on MDT$((i - 1))"
+                       # deviation should be < 5% of average
+                       [ $count -lt $((95 * stripe_count)) ] ||
+                       [ $count -gt $((105 * stripe_count)) ] &&
+                               error "stripes are not evenly distributed"
+               fi
        done
 
-       rm -rf $DIR/$tdir/*
+       $LCTL set_param lmv.*.qos_threshold_rr=$lmv_qos_threshold_rr > /dev/null
+       do_nodes $mdts $LCTL set_param \
+               lod.*.mdt_qos_thresholdrr=$lod_qos_threshold_rr > /dev/null
 
-       $LCTL set_param lmv.*.qos_threshold_rr=$qos_threshold_rr
-       # Shorten statfs result age, so that it can be updated in time
-       $LCTL set_param lmv.*.qos_maxage=1
-       sleep_maxage
+       echo
+       echo "Check for uneven MDTs: "
 
        local ffree
        local bavail
@@ -20867,9 +20866,8 @@ test_413b() {
 
        # Check if we need to generate uneven MDTs
        local threshold=50
-       local diff=$(((max - min ) * 100 / min))
+       local diff=$(((max - min) * 100 / min))
        local value="$(generate_string 1024)"
-       local i
 
        while [ $diff -lt $threshold ]; do
                # generate uneven MDTs, create till $threshold% diff
@@ -20884,11 +20882,11 @@ test_413b() {
                        error "mkdir $tdir-MDT$min_index failed"
                for i in $(seq $count); do
                        $OPENFILE -f O_CREAT:O_LOV_DELAY_CREATE \
-                               $DIR/$tdir-MDT$min_index/f$i > /dev/null ||
-                               error "create f$i failed"
+                               $DIR/$tdir-MDT$min_index/f$j_$i > /dev/null ||
+                               error "create f$j_$i failed"
                        setfattr -n user.413b -v $value \
-                               $DIR/$tdir-MDT$min_index/f$i ||
-                               error "setfattr f$i failed"
+                               $DIR/$tdir-MDT$min_index/f$j_$i ||
+                               error "setfattr f$j_$i failed"
                done
 
                ffree=($(lctl get_param -n mdc.*[mM][dD][cC]-*.filesfree))
@@ -20904,31 +20902,95 @@ test_413b() {
        echo "MDT blocks available: ${bavail[@]}"
        echo "weight diff=$diff%"
 
-       echo "mkdir with balanced space usage"
-       $LCTL set_param lmv.*.qos_prio_free=100
+       echo
+       echo "Mkdir (stripe_count $stripe_count) with balanced space usage:"
+
+       $LCTL set_param lmv.*.qos_prio_free=100 > /dev/null
+       do_nodes $mdts $LCTL set_param lod.*.mdt_qos_prio_free=100 > /dev/null
+       # decrease statfs age, so that it can be updated in time
+       $LCTL set_param lmv.*.qos_maxage=1 > /dev/null
+       do_nodes $mdts $LCTL set_param lod.*.mdt_qos_maxage=1 > /dev/null
+
+       sleep 1
+
+       testdir=$DIR/$tdir-s$stripe_count/qos
+
        for i in $(seq $((100 * MDSCOUNT))); do
-               mkdir $DIR/$tdir/subdir$i || error "mkdir subdir$i failed"
+               eval $mkdir_cmd $testdir/subdir$i ||
+                       error "$mkdir_cmd subdir$i failed"
        done
 
        for i in $(seq $MDSCOUNT); do
-               count=$($LFS getdirstripe -i $DIR/$tdir/* | grep ^$((i - 1))$ |
-                       wc -w)
+               count=$($LFS getdirstripe -i $testdir/* | grep ^$((i - 1))$ |
+                       wc -l)
                echo "$count directories created on MDT$((i - 1))"
+
+               if [ $stripe_count -gt 1 ]; then
+                       count=$($LFS getdirstripe $testdir/* |
+                               grep -P "^\s+$((i - 1))\t" | wc -l)
+                       echo "$count stripes created on MDT$((i - 1))"
+               fi
        done
 
-       max=$($LFS getdirstripe -i $DIR/$tdir/* | grep ^$max_index$ | wc -l)
-       min=$($LFS getdirstripe -i $DIR/$tdir/* | grep ^$min_index$ | wc -l)
+       max=$($LFS getdirstripe -i $testdir/* | grep ^$max_index$ | wc -l)
+       min=$($LFS getdirstripe -i $testdir/* | grep ^$min_index$ | wc -l)
 
+       # difference should be > 10% of average
        [ $((max - min)) -lt 10 ] &&
                error "subdirs shouldn't be evenly distributed"
 
-       which getfattr > /dev/null 2>&1 || skip_env "no getfattr command"
+       # ditto
+       if [ $stripe_count -gt 1 ]; then
+               max=$($LFS getdirstripe $testdir/* |
+                       grep -P "^\s+$max_index\t" | wc -l)
+               min=$($LFS getdirstripe $testdir/* |
+                       grep -P "^\s+$min_index\t" | wc -l)
+               [ $((max - min)) -le $((10 * stripe_count)) ] &&
+                       error "stripes shouldn't be evenly distributed"|| true
+       fi
+}
+
+test_413a() {
+       [ $MDSCOUNT -lt 2 ] &&
+               skip "We need at least 2 MDTs for this test"
 
-       $LFS setdirstripe -D -d $DIR/$tdir || error "setdirstripe -d failed"
-       getfattr -n trusted.dmv $DIR/$tdir &&
-               error "default dir layout exists" || true
+       [ $MDS1_VERSION -lt $(version_code 2.12.52) ] &&
+               skip "Need server version at least 2.12.52"
+
+       local stripe_count
+
+       for stripe_count in $(seq 1 $((MDSCOUNT - 1))); do
+               mkdir $DIR/$tdir-s$stripe_count || error "mkdir failed"
+               mkdir $DIR/$tdir-s$stripe_count/rr || error "mkdir failed"
+               mkdir $DIR/$tdir-s$stripe_count/qos || error "mkdir failed"
+               test_qos_mkdir "$LFS mkdir -c $stripe_count" $stripe_count
+       done
+}
+run_test 413a "QoS mkdir with 'lfs mkdir -i -1'"
+
+test_413b() {
+       [ $MDSCOUNT -lt 2 ] &&
+               skip "We need at least 2 MDTs for this test"
+
+       [ $MDS1_VERSION -lt $(version_code 2.12.52) ] &&
+               skip "Need server version at least 2.12.52"
+
+       local stripe_count
+
+       for stripe_count in $(seq 1 $((MDSCOUNT - 1))); do
+               mkdir $DIR/$tdir-s$stripe_count || error "mkdir failed"
+               mkdir $DIR/$tdir-s$stripe_count/rr || error "mkdir failed"
+               mkdir $DIR/$tdir-s$stripe_count/qos || error "mkdir failed"
+               $LFS setdirstripe -D -c $stripe_count \
+                       $DIR/$tdir-s$stripe_count/rr ||
+                       error "setdirstripe failed"
+               $LFS setdirstripe -D -c $stripe_count \
+                       $DIR/$tdir-s$stripe_count/qos ||
+                       error "setdirstripe failed"
+               test_qos_mkdir "mkdir" $stripe_count
+       done
 }
-run_test 413b "mkdir with balanced space usage"
+run_test 413b "QoS mkdir under dir whose default LMV starting MDT offset is -1"
 
 test_414() {
 #define OBD_FAIL_PTLRPC_BULK_ATTACH      0x521
index 6364a1c..079d2a0 100644 (file)
@@ -710,9 +710,6 @@ static int check_hashtype(const char *hashtype)
                if (strcmp(hashtype, mdt_hash_name[i]) == 0)
                        return i;
 
-       if (!strcmp(hashtype, LMV_HASH_NAME_SPACE))
-               return LMV_HASH_TYPE_DEFAULT | LMV_HASH_FLAG_SPACE;
-
        return 0;
 }
 
@@ -5592,28 +5589,6 @@ static int mntdf(char *mntdir, char *fsname, char *pool, enum mntdf_flags flags,
        return rc;
 }
 
-static int ll_statfs_data_comp(const void *sd1, const void *sd2)
-{
-       const struct obd_statfs *st1 = &((const struct ll_statfs_data *)sd1)->
-                                               sd_st;
-       const struct obd_statfs *st2 = &((const struct ll_statfs_data *)sd2)->
-                                               sd_st;
-       int r1 = obd_statfs_ratio(st1, false);
-       int r2 = obd_statfs_ratio(st2, false);
-       int64_t result = r1 - r2;
-
-       /* if both space usage are above 90, compare free inodes */
-       if (r1 > 90 && r2 > 90)
-               result = st2->os_ffree - st1->os_ffree;
-
-       if (result < 0)
-               return -1;
-       else if (result == 0)
-               return 0;
-       else
-               return 1;
-}
-
 /* functions */
 static int lfs_setdirstripe(int argc, char **argv)
 {
@@ -5626,12 +5601,9 @@ static int lfs_setdirstripe(int argc, char **argv)
        char *mode_opt = NULL;
        bool default_stripe = false;
        bool delete = false;
-       bool auto_distributed = false;
        bool foreign_mode = false;
        mode_t mode = S_IRWXU | S_IRWXG | S_IRWXO;
        mode_t previous_mode = 0;
-       struct ll_statfs_buf *lsb = NULL;
-       char mntdir[PATH_MAX] = "";
        char *xattr = NULL;
        __u32 type = LU_FOREIGN_TYPE_DAOS, flags = 0;
        struct option long_opts[] = {
@@ -5892,21 +5864,6 @@ static int lfs_setdirstripe(int argc, char **argv)
                memcpy(param->lsp_tgts, mdts, sizeof(*mdts) * lsa.lsa_nr_tgts);
        }
 
-       if (!default_stripe && (lsa.lsa_pattern & LMV_HASH_FLAG_SPACE)) {
-               fprintf(stderr, "%s %s: can only specify -H space with -D\n",
-                       progname, argv[0]);
-               free(param);
-               return CMD_HELP;
-       }
-
-       if (param->lsp_stripe_offset != -1 &&
-           lsa.lsa_pattern & LMV_HASH_FLAG_SPACE) {
-               fprintf(stderr, "%s %s: can only specify -H space with -i -1\n",
-                       progname, argv[0]);
-               free(param);
-               return CMD_HELP;
-       }
-
        dname = argv[optind];
        do {
                if (default_stripe) {
@@ -5918,100 +5875,6 @@ static int lfs_setdirstripe(int argc, char **argv)
                        continue;
                }
 
-               /*
-                * if current \a dname isn't under the same \a mntdir as the
-                * last one, and the last one was auto-distributed, restore
-                * \a param.
-                */
-               if (mntdir[0] != '\0' &&
-                   strncmp(dname, mntdir, strlen(mntdir)) &&
-                   auto_distributed) {
-                       param->lsp_is_specific = false;
-                       param->lsp_stripe_offset = -1;
-                       auto_distributed = false;
-               }
-
-               /*
-                * TODO: when MDT can allocate object with QoS (LU-9435), below
-                * code should be removed, instead we should let LMV to allocate
-                * the starting MDT object, and then let LOD allocate other MDT
-                * objects.
-                */
-               if (!param->lsp_is_specific && param->lsp_stripe_offset == -1) {
-                       char path[PATH_MAX] = "";
-
-                       if (!lsb) {
-                               lsb = malloc(sizeof(*lsb));
-                               if (!lsb) {
-                                       result = -ENOMEM;
-                                       break;
-                               }
-                       }
-                       lsb->sb_count = 0;
-
-                       /* use mntdir for dirname() temporarily */
-                       strncpy(mntdir, dname, sizeof(mntdir) - 1);
-                       if (!realpath(dirname(mntdir), path)) {
-                               result = -errno;
-                               fprintf(stderr,
-                                       "error: invalid path '%s': %s\n",
-                                       argv[optind], strerror(errno));
-                               break;
-                       }
-                       mntdir[0] = '\0';
-
-                       result = llapi_search_mounts(path, 0, mntdir, NULL);
-                       if (result < 0 || mntdir[0] == '\0') {
-                               fprintf(stderr,
-                                       "No suitable Lustre mount found\n");
-                               break;
-                       }
-
-                       result = mntdf(mntdir, NULL, NULL, 0, LL_STATFS_LMV,
-                                      lsb);
-                       if (result < 0)
-                               break;
-
-                       if (param->lsp_stripe_count > lsb->sb_count) {
-                               fprintf(stderr,
-                                       "error: stripe count %d is too big\n",
-                                       param->lsp_stripe_count);
-                               result = -ERANGE;
-                               break;
-                       }
-
-                       qsort(lsb->sb_buf, lsb->sb_count,
-                             sizeof(struct ll_statfs_data),
-                             ll_statfs_data_comp);
-
-                       auto_distributed = true;
-               }
-
-               if (auto_distributed) {
-                       int r;
-                       int nr = MAX(param->lsp_stripe_count,
-                                    lsb->sb_count / 2);
-
-                       /* don't use server whose usage is above 90% */
-                       while (nr != param->lsp_stripe_count &&
-                              obd_statfs_ratio(&lsb->sb_buf[nr].sd_st, false) >
-                              90)
-                               nr = MAX(param->lsp_stripe_count, nr / 2);
-
-                       /* get \a r between [0, nr) */
-                       r = rand() % nr;
-
-                       param->lsp_stripe_offset = lsb->sb_buf[r].sd_index;
-                       if (param->lsp_stripe_count > 1) {
-                               int i = 0;
-
-                               param->lsp_is_specific = true;
-                               for (; i < param->lsp_stripe_count; i++)
-                                       param->lsp_tgts[(i + r) % nr] =
-                                               lsb->sb_buf[i].sd_index;
-                       }
-               }
-
                result = llapi_dir_create(dname, mode, param);
                if (result)
                        fprintf(stderr,
@@ -6022,7 +5885,6 @@ static int lfs_setdirstripe(int argc, char **argv)
        if (mode_opt != NULL)
                umask(previous_mode);
 
-       free(lsb);
        free(param);
        return result;
 }
@@ -6095,7 +5957,7 @@ static int lfs_mv(int argc, char **argv)
                }
        }
 
-       if (lmu.lum_stripe_offset == -1) {
+       if (lmu.lum_stripe_offset == LMV_OFFSET_DEFAULT) {
                fprintf(stderr, "%s mv: MDT index must be specified\n",
                        progname);
                return CMD_HELP;
index 848a72d..754536f 100644 (file)
@@ -3109,8 +3109,6 @@ void lmv_dump_user_lmm(struct lmv_user_md *lum, char *pool_name,
                else
                        llapi_printf(LLAPI_MSG_NORMAL, "%#x", type);
 
-               if (flags & LMV_HASH_FLAG_SPACE)
-                       llapi_printf(LLAPI_MSG_NORMAL, ",space");
                if (flags & LMV_HASH_FLAG_MIGRATION)
                        llapi_printf(LLAPI_MSG_NORMAL, ",migrating");
                if (flags & LMV_HASH_FLAG_BAD_TYPE)
@@ -5209,7 +5207,7 @@ static int cb_getstripe(char *path, DIR *parent, DIR **dirp, void *data,
 
                                lum->lum_magic = LMV_USER_MAGIC;
                                lum->lum_stripe_count = 0;
-                               lum->lum_stripe_offset = -1;
+                               lum->lum_stripe_offset = LMV_OFFSET_DEFAULT;
                                goto dump;
                        } else if (param->fp_get_lmv) {
                                struct lmv_user_md *lum = param->fp_lmv_md;
index 74e4a31..a4d6a4b 100644 (file)
@@ -851,7 +851,6 @@ check_lmv_mds_md_v1(void)
        CHECK_CDEFINE(LMV_MAGIC_V1);
        CHECK_CDEFINE(LMV_MAGIC_STRIPE);
        CHECK_CDEFINE(LMV_HASH_TYPE_MASK);
-       CHECK_CDEFINE(LMV_HASH_FLAG_SPACE);
        CHECK_CDEFINE(LMV_HASH_FLAG_LOST_LMV);
        CHECK_CDEFINE(LMV_HASH_FLAG_BAD_TYPE);
        CHECK_CDEFINE(LMV_HASH_FLAG_MIGRATION);
index 2f7dc2f..68b6691 100644 (file)
@@ -1882,7 +1882,6 @@ void lustre_assert_wire_constants(void)
        CLASSERT(LMV_MAGIC_V1 == 0x0CD20CD0);
        CLASSERT(LMV_MAGIC_STRIPE == 0x0CD40CD0);
        CLASSERT(LMV_HASH_TYPE_MASK == 0x0000ffff);
-       CLASSERT(LMV_HASH_FLAG_SPACE == 0x08000000);
        CLASSERT(LMV_HASH_FLAG_LOST_LMV == 0x10000000);
        CLASSERT(LMV_HASH_FLAG_BAD_TYPE == 0x20000000);
        CLASSERT(LMV_HASH_FLAG_MIGRATION == 0x80000000);