Whamcloud - gitweb
LU-12624 lod: alloc dir stripes by QoS
[fs/lustre-release.git] / lustre / lmv / lmv_obd.c
index 6747995..8af14da 100644 (file)
@@ -63,11 +63,12 @@ static int lmv_check_connect(struct obd_device *obd);
 void lmv_activate_target(struct lmv_obd *lmv, struct lmv_tgt_desc *tgt,
                         int activate)
 {
-        if (tgt->ltd_active == activate)
-                return;
+       if (tgt->ltd_active == activate)
+               return;
 
-        tgt->ltd_active = activate;
-        lmv->desc.ld_active_tgt_count += (activate ? 1 : -1);
+       tgt->ltd_active = activate;
+       lmv->lmv_mdt_descs.ltd_lmv_desc.ld_active_tgt_count +=
+               (activate ? 1 : -1);
 
        tgt->ltd_exp->exp_obd->obd_inactive = !activate;
 }
@@ -343,11 +344,11 @@ int lmv_connect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt)
 
        tgt->ltd_active = 1;
        tgt->ltd_exp = mdc_exp;
-       lmv->desc.ld_active_tgt_count++;
+       lmv->lmv_mdt_descs.ltd_lmv_desc.ld_active_tgt_count++;
 
        md_init_ea_size(tgt->ltd_exp, lmv->max_easize, lmv->max_def_easize);
 
-       rc = lqos_add_tgt(&lmv->lmv_qos, tgt);
+       rc = lu_qos_add_tgt(&lmv->lmv_qos, tgt);
        if (rc) {
                obd_disconnect(mdc_exp);
                RETURN(rc);
@@ -370,8 +371,7 @@ int lmv_connect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt)
 static void lmv_del_target(struct lmv_obd *lmv, struct lu_tgt_desc *tgt)
 {
        LASSERT(tgt);
-       lqos_del_tgt(&lmv->lmv_qos, tgt);
-       lu_tgt_descs_del(&lmv->lmv_mdt_descs, tgt);
+       ltd_del_tgt(&lmv->lmv_mdt_descs, tgt);
        OBD_FREE_PTR(tgt);
 }
 
@@ -382,7 +382,6 @@ static int lmv_add_target(struct obd_device *obd, struct obd_uuid *uuidp,
        struct lmv_obd *lmv = &obd->u.lmv;
        struct lmv_tgt_desc *tgt;
        struct lu_tgt_descs *ltd = &lmv->lmv_mdt_descs;
-       int orig_tgt_count = 0;
        int rc = 0;
 
        ENTRY;
@@ -406,11 +405,7 @@ static int lmv_add_target(struct obd_device *obd, struct obd_uuid *uuidp,
        tgt->ltd_active = 0;
 
        mutex_lock(&ltd->ltd_mutex);
-       rc = lu_tgt_descs_add(ltd, tgt);
-       if (!rc && index >= lmv->desc.ld_tgt_count) {
-               orig_tgt_count = lmv->desc.ld_tgt_count;
-               lmv->desc.ld_tgt_count = index + 1;
-       }
+       rc = ltd_add_tgt(ltd, tgt);
        mutex_unlock(&ltd->ltd_mutex);
 
        if (rc)
@@ -421,14 +416,10 @@ static int lmv_add_target(struct obd_device *obd, struct obd_uuid *uuidp,
                RETURN(0);
 
        rc = lmv_connect_mdc(obd, tgt);
-       if (rc != 0) {
-               mutex_lock(&ltd->ltd_mutex);
-               lmv->desc.ld_tgt_count = orig_tgt_count;
-               memset(tgt, 0, sizeof(*tgt));
-               mutex_unlock(&ltd->ltd_mutex);
-       } else {
+       if (!rc) {
                int easize = sizeof(struct lmv_stripe_md) +
-                       lmv->desc.ld_tgt_count * sizeof(struct lu_fid);
+                       lmv->lmv_mdt_count * sizeof(struct lu_fid);
+
                lmv_init_ea_size(obd->obd_self_export, easize, 0);
        }
 
@@ -455,7 +446,7 @@ static int lmv_check_connect(struct obd_device *obd)
        if (lmv->connected)
                GOTO(unlock, rc = 0);
 
-       if (lmv->desc.ld_tgt_count == 0) {
+       if (!lmv->lmv_mdt_count) {
                CERROR("%s: no targets configured: rc = -EINVAL\n",
                       obd->obd_name);
                GOTO(unlock, rc = -EINVAL);
@@ -477,7 +468,7 @@ static int lmv_check_connect(struct obd_device *obd)
        }
 
        lmv->connected = 1;
-       easize = lmv_mds_md_size(lmv->desc.ld_tgt_count, LMV_MAGIC);
+       easize = lmv_mds_md_size(lmv->lmv_mdt_count, LMV_MAGIC);
        lmv_init_ea_size(obd->obd_self_export, easize, 0);
        EXIT;
 unlock:
@@ -491,7 +482,7 @@ out_disc:
                if (!tgt->ltd_exp)
                        continue;
 
-               --lmv->desc.ld_active_tgt_count;
+               --lmv->lmv_mdt_descs.ltd_lmv_desc.ld_active_tgt_count;
                obd_disconnect(tgt->ltd_exp);
        }
 
@@ -827,7 +818,7 @@ static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp,
        struct lmv_obd *lmv = &obddev->u.lmv;
        struct lu_tgt_desc *tgt = NULL;
        int set = 0;
-       __u32 count = lmv->desc.ld_tgt_count;
+       __u32 count = lmv->lmv_mdt_count;
        int rc = 0;
 
        ENTRY;
@@ -843,7 +834,8 @@ static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp,
                __u32 index;
 
                memcpy(&index, data->ioc_inlbuf2, sizeof(__u32));
-               if (index >= count)
+
+               if (index >= lmv->lmv_mdt_descs.ltd_tgts_size)
                        RETURN(-ENODEV);
 
                tgt = lmv_tgt(lmv, index);
@@ -876,12 +868,7 @@ static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp,
                struct obd_quotactl *oqctl;
 
                if (qctl->qc_valid == QC_MDTIDX) {
-                       if (count <= qctl->qc_idx)
-                               RETURN(-EINVAL);
-
                        tgt = lmv_tgt(lmv, qctl->qc_idx);
-                       if (!tgt || !tgt->ltd_exp)
-                               RETURN(-EINVAL);
                } else if (qctl->qc_valid == QC_UUID) {
                        lmv_foreach_tgt(lmv, tgt) {
                                if (!obd_uuid_equals(&tgt->ltd_uuid,
@@ -897,10 +884,9 @@ static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp,
                        RETURN(-EINVAL);
                }
 
-               if (tgt->ltd_index >= count)
-                       RETURN(-EAGAIN);
+               if (!tgt || !tgt->ltd_exp)
+                       RETURN(-EINVAL);
 
-               LASSERT(tgt != NULL && tgt->ltd_exp != NULL);
                OBD_ALLOC_PTR(oqctl);
                if (!oqctl)
                        RETURN(-ENOMEM);
@@ -1069,111 +1055,38 @@ hsm_req_err:
        RETURN(rc);
 }
 
-/**
- * This is _inode_ placement policy function (not name).
- */
-static u32 lmv_placement_policy(struct obd_device *obd,
-                               struct md_op_data *op_data)
+int lmv_fid_alloc(const struct lu_env *env, struct obd_export *exp,
+                 struct lu_fid *fid, struct md_op_data *op_data)
 {
+       struct obd_device *obd = class_exp2obd(exp);
        struct lmv_obd *lmv = &obd->u.lmv;
-       struct lmv_user_md *lum;
-       u32 mdt;
-
-       ENTRY;
-
-       if (lmv->desc.ld_tgt_count == 1)
-               RETURN(0);
-
-       lum = op_data->op_data;
-       /*
-        * Choose MDT by
-        * 1. See if the stripe offset is specified by lum.
-        * 2. If parent has default LMV, and its hash type is "space", choose
-        *    MDT with QoS. (see lmv_locate_tgt_qos()).
-        * 3. Then check if default LMV stripe offset is not -1.
-        * 4. Finally choose MDS by name hash if the parent
-        *    is striped directory. (see lmv_locate_tgt()).
-        *
-        * presently explicit MDT location is not supported
-        * for foreign dirs (as it can't be embedded into free
-        * format LMV, like with lum_stripe_offset), so we only
-        * rely on default stripe offset or then name hashing.
-        */
-       if (op_data->op_cli_flags & CLI_SET_MEA && lum != NULL &&
-           le32_to_cpu(lum->lum_magic != LMV_MAGIC_FOREIGN) &&
-           le32_to_cpu(lum->lum_stripe_offset) != (__u32)-1) {
-               mdt = le32_to_cpu(lum->lum_stripe_offset);
-       } else if (op_data->op_code == LUSTRE_OPC_MKDIR &&
-                  !lmv_dir_striped(op_data->op_mea1) &&
-                  lmv_dir_qos_mkdir(op_data->op_default_mea1)) {
-               mdt = op_data->op_mds;
-       } else if (op_data->op_code == LUSTRE_OPC_MKDIR &&
-                  op_data->op_default_mea1 &&
-                  op_data->op_default_mea1->lsm_md_master_mdt_index !=
-                       (__u32)-1) {
-               mdt = op_data->op_default_mea1->lsm_md_master_mdt_index;
-               op_data->op_mds = mdt;
-       } else {
-               mdt = op_data->op_mds;
-       }
-
-       RETURN(mdt);
-}
-
-int __lmv_fid_alloc(struct lmv_obd *lmv, struct lu_fid *fid, u32 mds)
-{
        struct lmv_tgt_desc *tgt;
        int rc;
 
        ENTRY;
 
-       tgt = lmv_tgt(lmv, mds);
+       LASSERT(op_data);
+       LASSERT(fid);
+
+       tgt = lmv_tgt(lmv, op_data->op_mds);
        if (!tgt)
                RETURN(-ENODEV);
 
+       if (!tgt->ltd_active || !tgt->ltd_exp)
+               RETURN(-ENODEV);
+
        /*
         * New seq alloc and FLD setup should be atomic. Otherwise we may find
         * on server that seq in new allocated fid is not yet known.
         */
        mutex_lock(&tgt->ltd_fid_mutex);
-
-       if (tgt->ltd_active == 0 || tgt->ltd_exp == NULL)
-               GOTO(out, rc = -ENODEV);
-
-       /*
-        * Asking underlying tgt layer to allocate new fid.
-        */
        rc = obd_fid_alloc(NULL, tgt->ltd_exp, fid, NULL);
+       mutex_unlock(&tgt->ltd_fid_mutex);
        if (rc > 0) {
                LASSERT(fid_is_sane(fid));
                rc = 0;
        }
 
-       EXIT;
-out:
-       mutex_unlock(&tgt->ltd_fid_mutex);
-       return rc;
-}
-
-int lmv_fid_alloc(const struct lu_env *env, struct obd_export *exp,
-                 struct lu_fid *fid, struct md_op_data *op_data)
-{
-       struct obd_device *obd = class_exp2obd(exp);
-       struct lmv_obd *lmv = &obd->u.lmv;
-       u32 mds;
-       int rc;
-
-       ENTRY;
-
-       LASSERT(op_data != NULL);
-       LASSERT(fid != NULL);
-
-       mds = lmv_placement_policy(obd, op_data);
-
-       rc = __lmv_fid_alloc(lmv, fid, mds);
-       if (rc)
-               CERROR("Can't alloc new fid, rc %d\n", rc);
-
        RETURN(rc);
 }
 
@@ -1199,27 +1112,17 @@ static int lmv_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
                RETURN(-EINVAL);
        }
 
-       obd_str2uuid(&lmv->desc.ld_uuid, desc->ld_uuid.uuid);
-       lmv->desc.ld_tgt_count = 0;
-       lmv->desc.ld_active_tgt_count = 0;
-       lmv->desc.ld_qos_maxage = LMV_DESC_QOS_MAXAGE_DEFAULT;
+       obd_str2uuid(&lmv->lmv_mdt_descs.ltd_lmv_desc.ld_uuid,
+                    desc->ld_uuid.uuid);
+       lmv->lmv_mdt_descs.ltd_lmv_desc.ld_tgt_count = 0;
+       lmv->lmv_mdt_descs.ltd_lmv_desc.ld_active_tgt_count = 0;
+       lmv->lmv_mdt_descs.ltd_lmv_desc.ld_qos_maxage =
+               LMV_DESC_QOS_MAXAGE_DEFAULT;
        lmv->max_def_easize = 0;
        lmv->max_easize = 0;
 
        spin_lock_init(&lmv->lmv_lock);
 
-       /* Set up allocation policy (QoS and RR) */
-       INIT_LIST_HEAD(&lmv->lmv_qos.lq_svr_list);
-       init_rwsem(&lmv->lmv_qos.lq_rw_sem);
-       lmv->lmv_qos.lq_dirty = 1;
-       lmv->lmv_qos.lq_reset = 1;
-       /* Default priority is toward free space balance */
-       lmv->lmv_qos.lq_prio_free = 232;
-       /* Default threshold for rr (roughly 17%) */
-       lmv->lmv_qos.lq_threshold_rr = 43;
-
-       lu_qos_rr_init(&lmv->lmv_qos.lq_rr);
-
        /*
         * initialize rr_index to lower 32bit of netid, so that client
         * can distribute subdirs evenly from the beginning.
@@ -1241,7 +1144,7 @@ static int lmv_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
        if (rc)
                CERROR("Can't init FLD, err %d\n", rc);
 
-       rc = lu_tgt_descs_init(&lmv->lmv_mdt_descs);
+       rc = lu_tgt_descs_init(&lmv->lmv_mdt_descs, true);
        if (rc)
                CWARN("%s: error initialize target table: rc = %d\n",
                      obd->obd_name, rc);
@@ -1304,7 +1207,7 @@ static int lmv_select_statfs_mdt(struct lmv_obd *lmv, __u32 flags)
        if (flags & OBD_STATFS_FOR_MDT0)
                return 0;
 
-       if (lmv->lmv_statfs_start || lmv->desc.ld_tgt_count == 1)
+       if (lmv->lmv_statfs_start || lmv->lmv_mdt_count == 1)
                return lmv->lmv_statfs_start;
 
        /* choose initial MDT for this client */
@@ -1317,8 +1220,8 @@ static int lmv_select_statfs_mdt(struct lmv_obd *lmv, __u32 flags)
                        /* We dont need a full 64-bit modulus, just enough
                         * to distribute the requests across MDTs evenly.
                         */
-                       lmv->lmv_statfs_start =
-                               (u32)lnet_id.nid % lmv->desc.ld_tgt_count;
+                       lmv->lmv_statfs_start = (u32)lnet_id.nid %
+                                               lmv->lmv_mdt_count;
                        break;
                }
        }
@@ -1346,8 +1249,8 @@ static int lmv_statfs(const struct lu_env *env, struct obd_export *exp,
        /* distribute statfs among MDTs */
        idx = lmv_select_statfs_mdt(lmv, flags);
 
-       for (i = 0; i < lmv->desc.ld_tgt_count; i++, idx++) {
-               idx = idx % lmv->desc.ld_tgt_count;
+       for (i = 0; i < lmv->lmv_mdt_descs.ltd_tgts_size; i++, idx++) {
+               idx = idx % lmv->lmv_mdt_descs.ltd_tgts_size;
                tgt = lmv_tgt(lmv, idx);
                if (!tgt || !tgt->ltd_exp)
                        continue;
@@ -1423,7 +1326,7 @@ int lmv_statfs_check_update(struct obd_device *obd, struct lmv_tgt_desc *tgt)
        int rc;
 
        if (ktime_get_seconds() - tgt->ltd_statfs_age <
-           obd->u.lmv.desc.ld_qos_maxage)
+           obd->u.lmv.lmv_mdt_descs.ltd_lmv_desc.ld_qos_maxage)
                return 0;
 
        rc = obd_statfs_async(tgt->ltd_exp, &oinfo, 0, NULL);
@@ -1555,6 +1458,89 @@ static int lmv_close(struct obd_export *exp, struct md_op_data *op_data,
        RETURN(rc);
 }
 
+static struct lu_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv, __u32 *mdt)
+{
+       struct lu_tgt_desc *tgt;
+       __u64 total_weight = 0;
+       __u64 cur_weight = 0;
+       __u64 rand;
+       int rc;
+
+       ENTRY;
+
+       if (!ltd_qos_is_usable(&lmv->lmv_mdt_descs))
+               RETURN(ERR_PTR(-EAGAIN));
+
+       down_write(&lmv->lmv_qos.lq_rw_sem);
+
+       if (!ltd_qos_is_usable(&lmv->lmv_mdt_descs))
+               GOTO(unlock, tgt = ERR_PTR(-EAGAIN));
+
+       rc = ltd_qos_penalties_calc(&lmv->lmv_mdt_descs);
+       if (rc)
+               GOTO(unlock, tgt = ERR_PTR(rc));
+
+       lmv_foreach_tgt(lmv, tgt) {
+               tgt->ltd_qos.ltq_usable = 0;
+               if (!tgt->ltd_exp || !tgt->ltd_active)
+                       continue;
+
+               tgt->ltd_qos.ltq_usable = 1;
+               lu_tgt_qos_weight_calc(tgt);
+               total_weight += tgt->ltd_qos.ltq_weight;
+       }
+
+       rand = lu_prandom_u64_max(total_weight);
+
+       lmv_foreach_connected_tgt(lmv, tgt) {
+               if (!tgt->ltd_qos.ltq_usable)
+                       continue;
+
+               cur_weight += tgt->ltd_qos.ltq_weight;
+               if (cur_weight < rand)
+                       continue;
+
+               *mdt = tgt->ltd_index;
+               ltd_qos_update(&lmv->lmv_mdt_descs, tgt, &total_weight);
+               GOTO(unlock, rc = 0);
+       }
+
+       /* no proper target found */
+       GOTO(unlock, tgt = ERR_PTR(-EAGAIN));
+unlock:
+       up_write(&lmv->lmv_qos.lq_rw_sem);
+
+       return tgt;
+}
+
+static struct lu_tgt_desc *lmv_locate_tgt_rr(struct lmv_obd *lmv, __u32 *mdt)
+{
+       struct lu_tgt_desc *tgt;
+       int i;
+       int index;
+
+       ENTRY;
+
+       spin_lock(&lmv->lmv_qos.lq_rr.lqr_alloc);
+       for (i = 0; i < lmv->lmv_mdt_descs.ltd_tgts_size; i++) {
+               index = (i + lmv->lmv_qos_rr_index) %
+                       lmv->lmv_mdt_descs.ltd_tgts_size;
+               tgt = lmv_tgt(lmv, index);
+               if (!tgt || !tgt->ltd_exp || !tgt->ltd_active)
+                       continue;
+
+               *mdt = tgt->ltd_index;
+               lmv->lmv_qos_rr_index = (*mdt + 1) %
+                                       lmv->lmv_mdt_descs.ltd_tgts_size;
+               spin_unlock(&lmv->lmv_qos.lq_rr.lqr_alloc);
+
+               RETURN(tgt);
+       }
+       spin_unlock(&lmv->lmv_qos.lq_rr.lqr_alloc);
+
+       RETURN(ERR_PTR(-ENODEV));
+}
+
 static struct lmv_tgt_desc *
 lmv_locate_tgt_by_name(struct lmv_obd *lmv, struct lmv_stripe_md *lsm,
                       const char *name, int namelen, struct lu_fid *fid,
@@ -1600,8 +1586,7 @@ lmv_locate_tgt_by_name(struct lmv_obd *lmv, struct lmv_stripe_md *lsm,
  * which is set outside, and if dir is migrating, 'op_data->op_post_migrate'
  * indicates whether old or new layout is used to locate.
  *
- * For plain direcotry, normally it will locate MDT by FID, but if this
- * directory has default LMV, and its hash type is "space", locate MDT with QoS.
+ * For plain directory, it just locates the MDT of op_data->op_fid1.
  *
  * \param[in] lmv      LMV device
  * \param[in] op_data  client MD stack parameters, name, namelen
@@ -1624,7 +1609,7 @@ lmv_locate_tgt(struct lmv_obd *lmv, struct md_op_data *op_data)
         * index if the file under striped dir is being restored, see
         * ct_restore(). */
        if (op_data->op_bias & MDS_CREATE_VOLATILE &&
-           (int)op_data->op_mds != -1) {
+           op_data->op_mds != LMV_OFFSET_DEFAULT) {
                tgt = lmv_tgt(lmv, op_data->op_mds);
                if (!tgt)
                        return ERR_PTR(-ENODEV);
@@ -1652,30 +1637,7 @@ lmv_locate_tgt(struct lmv_obd *lmv, struct md_op_data *op_data)
                op_data->op_mds = oinfo->lmo_mds;
                tgt = lmv_tgt(lmv, oinfo->lmo_mds);
                if (!tgt)
-                       tgt = ERR_PTR(-ENODEV);
-       } else if (op_data->op_code == LUSTRE_OPC_MKDIR &&
-                  lmv_dir_qos_mkdir(op_data->op_default_mea1) &&
-                  !lmv_dir_striped(lsm)) {
-               tgt = lmv_locate_tgt_qos(lmv, &op_data->op_mds);
-               if (tgt == ERR_PTR(-EAGAIN))
-                       tgt = lmv_locate_tgt_rr(lmv, &op_data->op_mds);
-               /*
-                * only update statfs when mkdir under dir with "space" hash,
-                * this means the cached statfs may be stale, and current mkdir
-                * may not follow QoS accurately, but it's not serious, and it
-                * avoids periodic statfs when client doesn't mkdir under
-                * "space" hashed directories.
-                *
-                * TODO: after MDT support QoS object allocation, also update
-                * statfs for 'lfs mkdir -i -1 ...", currently it's done in user
-                * space.
-                */
-               if (!IS_ERR(tgt)) {
-                       struct obd_device *obd;
-
-                       obd = container_of(lmv, struct obd_device, u.lmv);
-                       lmv_statfs_check_update(obd, tgt);
-               }
+                       return ERR_PTR(-ENODEV);
        } else {
                tgt = lmv_locate_tgt_by_name(lmv, op_data->op_mea1,
                                op_data->op_name, op_data->op_namelen,
@@ -1728,6 +1690,78 @@ lmv_locate_tgt2(struct lmv_obd *lmv, struct md_op_data *op_data)
                                &op_data->op_mds, true);
 }
 
+int lmv_migrate_existence_check(struct lmv_obd *lmv, struct md_op_data *op_data)
+{
+       struct lu_tgt_desc *tgt;
+       struct ptlrpc_request *request;
+       int rc;
+
+       LASSERT(lmv_dir_migrating(op_data->op_mea1));
+
+       tgt = lmv_locate_tgt(lmv, op_data);
+       if (IS_ERR(tgt))
+               return PTR_ERR(tgt);
+
+       rc = md_getattr_name(tgt->ltd_exp, op_data, &request);
+       if (!rc) {
+               ptlrpc_req_finished(request);
+               return -EEXIST;
+       }
+
+       return rc;
+}
+
+/* mkdir by QoS in two cases:
+ * 1. 'lfs mkdir -i -1'
+ * 2. parent default LMV master_mdt_index is -1
+ *
+ * NB, mkdir by QoS only if parent is not striped; this avoids creating remote
+ * directories under a striped directory.
+ */
+static inline bool lmv_op_qos_mkdir(const struct md_op_data *op_data)
+{
+       const struct lmv_stripe_md *lsm = op_data->op_default_mea1;
+       const struct lmv_user_md *lum = op_data->op_data;
+
+       if (op_data->op_code != LUSTRE_OPC_MKDIR)
+               return false;
+
+       if (lmv_dir_striped(op_data->op_mea1))
+               return false;
+
+       if (op_data->op_cli_flags & CLI_SET_MEA && lum &&
+           (le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC ||
+            le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC_SPECIFIC) &&
+           le32_to_cpu(lum->lum_stripe_offset) == LMV_OFFSET_DEFAULT)
+               return true;
+
+       if (lsm && lsm->lsm_md_master_mdt_index == LMV_OFFSET_DEFAULT)
+               return true;
+
+       return false;
+}
+
+/* 'lfs mkdir -i <specific_MDT>' */
+static inline bool lmv_op_user_specific_mkdir(const struct md_op_data *op_data)
+{
+       const struct lmv_user_md *lum = op_data->op_data;
+
+       return op_data->op_code == LUSTRE_OPC_MKDIR &&
+              op_data->op_cli_flags & CLI_SET_MEA && lum &&
+              (le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC ||
+               le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC_SPECIFIC) &&
+              le32_to_cpu(lum->lum_stripe_offset) != LMV_OFFSET_DEFAULT;
+}
+
+/* parent default LMV master_mdt_index is not -1. */
+static inline bool
+lmv_op_default_specific_mkdir(const struct md_op_data *op_data)
+{
+       return op_data->op_code == LUSTRE_OPC_MKDIR &&
+              op_data->op_default_mea1 &&
+              op_data->op_default_mea1->lsm_md_master_mdt_index !=
+                       LMV_OFFSET_DEFAULT;
+}
 int lmv_create(struct obd_export *exp, struct md_op_data *op_data,
                const void *data, size_t datalen, umode_t mode, uid_t uid,
                gid_t gid, cfs_cap_t cap_effective, __u64 rdev,
@@ -1740,7 +1774,7 @@ int lmv_create(struct obd_export *exp, struct md_op_data *op_data,
 
        ENTRY;
 
-       if (!lmv->desc.ld_active_tgt_count)
+       if (!lmv->lmv_mdt_descs.ltd_lmv_desc.ld_active_tgt_count)
                RETURN(-EIO);
 
        if (lmv_dir_bad_hash(op_data->op_mea1))
@@ -1749,20 +1783,9 @@ int lmv_create(struct obd_export *exp, struct md_op_data *op_data,
        if (lmv_dir_migrating(op_data->op_mea1)) {
                /*
                 * if parent is migrating, create() needs to lookup existing
-                * name, to avoid creating new file under old layout of
-                * migrating directory, check old layout here.
+                * name in both old and new layout, check old layout on client.
                 */
-               tgt = lmv_locate_tgt(lmv, op_data);
-               if (IS_ERR(tgt))
-                       RETURN(PTR_ERR(tgt));
-
-               rc = md_getattr_name(tgt->ltd_exp, op_data, request);
-               if (!rc) {
-                       ptlrpc_req_finished(*request);
-                       *request = NULL;
-                       RETURN(-EEXIST);
-               }
-
+               rc = lmv_migrate_existence_check(lmv, op_data);
                if (rc != -ENOENT)
                        RETURN(rc);
 
@@ -1773,26 +1796,44 @@ int lmv_create(struct obd_export *exp, struct md_op_data *op_data,
        if (IS_ERR(tgt))
                RETURN(PTR_ERR(tgt));
 
-       CDEBUG(D_INODE, "CREATE name '%.*s' on "DFID" -> mds #%x\n",
-               (int)op_data->op_namelen, op_data->op_name,
-               PFID(&op_data->op_fid1), op_data->op_mds);
+       if (lmv_op_qos_mkdir(op_data)) {
+               tgt = lmv_locate_tgt_qos(lmv, &op_data->op_mds);
+               if (tgt == ERR_PTR(-EAGAIN))
+                       tgt = lmv_locate_tgt_rr(lmv, &op_data->op_mds);
+               /*
+                * only update statfs after QoS mkdir. This means the cached
+                * statfs may be stale and the current mkdir may not follow QoS
+                * accurately, but that's not serious, and it avoids periodic
+                * statfs when the client doesn't mkdir by QoS.
+                */
+               if (!IS_ERR(tgt))
+                       lmv_statfs_check_update(obd, tgt);
+       } else if (lmv_op_user_specific_mkdir(op_data)) {
+               struct lmv_user_md *lum = op_data->op_data;
+
+               op_data->op_mds = le32_to_cpu(lum->lum_stripe_offset);
+               tgt = lmv_tgt(lmv, op_data->op_mds);
+               if (!tgt)
+                       RETURN(-ENODEV);
+       } else if (lmv_op_default_specific_mkdir(op_data)) {
+               op_data->op_mds =
+                       op_data->op_default_mea1->lsm_md_master_mdt_index;
+               tgt = lmv_tgt(lmv, op_data->op_mds);
+               if (!tgt)
+                       RETURN(-ENODEV);
+       }
+
+       if (IS_ERR(tgt))
+               RETURN(PTR_ERR(tgt));
 
        rc = lmv_fid_alloc(NULL, exp, &op_data->op_fid2, op_data);
        if (rc)
                RETURN(rc);
 
-       if (exp_connect_flags(exp) & OBD_CONNECT_DIR_STRIPE) {
-               /* Send the create request to the MDT where the object
-                * will be located */
-               tgt = lmv_fid2tgt(lmv, &op_data->op_fid2);
-               if (IS_ERR(tgt))
-                       RETURN(PTR_ERR(tgt));
-
-               op_data->op_mds = tgt->ltd_index;
-       }
-
-       CDEBUG(D_INODE, "CREATE obj "DFID" -> mds #%x\n",
-              PFID(&op_data->op_fid2), op_data->op_mds);
+       CDEBUG(D_INODE, "CREATE name '%.*s' "DFID" on "DFID" -> mds #%x\n",
+               (int)op_data->op_namelen, op_data->op_name,
+               PFID(&op_data->op_fid2), PFID(&op_data->op_fid1),
+               op_data->op_mds);
 
        op_data->op_flags |= MF_MDC_CANCEL_FID1;
        rc = md_create(tgt->ltd_exp, op_data, data, datalen, mode, uid, gid,
@@ -2048,10 +2089,20 @@ static int lmv_migrate(struct obd_export *exp, struct md_op_data *op_data,
        if (IS_ERR(child_tgt))
                RETURN(PTR_ERR(child_tgt));
 
-       if (!S_ISDIR(op_data->op_mode) && tp_tgt)
-               rc = __lmv_fid_alloc(lmv, &target_fid, tp_tgt->ltd_index);
-       else
-               rc = lmv_fid_alloc(NULL, exp, &target_fid, op_data);
+       /* for directory, migrate to MDT specified by lum_stripe_offset;
+        * otherwise migrate to the target stripe of parent. If tp_tgt is
+        * unset (parent may have finished migration, normally along with
+        * current file), allocate FID on MDT lum_stripe_offset as well, and
+        * server will check whether file was migrated already.
+        */
+       if (S_ISDIR(op_data->op_mode) || !tp_tgt) {
+               struct lmv_user_md *lum = op_data->op_data;
+
+               op_data->op_mds = le32_to_cpu(lum->lum_stripe_offset);
+       } else  {
+               op_data->op_mds = tp_tgt->ltd_index;
+       }
+       rc = lmv_fid_alloc(NULL, exp, &target_fid, op_data);
        if (rc)
                RETURN(rc);
 
@@ -2879,7 +2930,7 @@ static int lmv_get_info(const struct lu_env *env, struct obd_export *exp,
                        exp->exp_connect_data = *(struct obd_connect_data *)val;
                RETURN(rc);
        } else if (KEY_IS(KEY_TGT_COUNT)) {
-               *((int *)val) = lmv->desc.ld_tgt_count;
+               *((int *)val) = lmv->lmv_mdt_descs.ltd_tgts_size;
                RETURN(0);
        }
 
@@ -2893,7 +2944,7 @@ static int lmv_rmfid(struct obd_export *exp, struct fid_array *fa,
        struct obd_device *obddev = class_exp2obd(exp);
        struct ptlrpc_request_set *set = _set;
        struct lmv_obd *lmv = &obddev->u.lmv;
-       int tgt_count = lmv->desc.ld_tgt_count;
+       int tgt_count = lmv->lmv_mdt_count;
        struct lu_tgt_desc *tgt;
        struct fid_array *fat, **fas = NULL;
        int i, rc, **rcs = NULL;
@@ -3068,7 +3119,7 @@ static int lmv_unpack_md_v1(struct obd_export *exp, struct lmv_stripe_md *lsm,
                 * set default value -1, so lmv_locate_tgt() knows this stripe
                 * target is not initialized.
                 */
-               lsm->lsm_md_oinfo[i].lmo_mds = (u32)-1;
+               lsm->lsm_md_oinfo[i].lmo_mds = LMV_OFFSET_DEFAULT;
                if (!fid_is_sane(&lsm->lsm_md_oinfo[i].lmo_fid))
                        continue;
 
@@ -3280,8 +3331,8 @@ enum ldlm_mode lmv_lock_match(struct obd_export *exp, __u64 flags,
         * since this can be easily found, and only try others if that fails.
         */
        for (i = 0, index = lmv_fid2tgt_index(lmv, fid);
-            i < lmv->desc.ld_tgt_count;
-            i++, index = (index + 1) % lmv->desc.ld_tgt_count) {
+            i < lmv->lmv_mdt_descs.ltd_tgts_size;
+            i++, index = (index + 1) % lmv->lmv_mdt_descs.ltd_tgts_size) {
                if (index < 0) {
                        CDEBUG(D_HA, "%s: "DFID" is inaccessible: rc = %d\n",
                               obd->obd_name, PFID(fid), index);