+ tgt = lmv_fid2tgt(lmv, fid);
+ if (IS_ERR(tgt))
+ RETURN(PTR_ERR(tgt));
+
+ rc = md_getxattr(tgt->ltd_exp, fid, obd_md_valid, name, buf_size, req);
+
+ RETURN(rc);
+}
+
+static int lmv_setxattr(struct obd_export *exp, const struct lu_fid *fid,
+			u64 obd_md_valid, const char *name,
+			const void *value, size_t value_size,
+			unsigned int xattr_flags, u32 suppgid,
+			struct ptlrpc_request **req)
+{
+	struct lmv_obd *lmv = &exp->exp_obd->u.lmv;
+	struct lmv_tgt_desc *tgt;
+
+	ENTRY;
+
+	/* forward the setxattr to the MDT that owns @fid */
+	tgt = lmv_fid2tgt(lmv, fid);
+	if (IS_ERR(tgt))
+		RETURN(PTR_ERR(tgt));
+
+	RETURN(md_setxattr(tgt->ltd_exp, fid, obd_md_valid, name, value,
+			   value_size, xattr_flags, suppgid, req));
+}
+
+static int lmv_getattr(struct obd_export *exp, struct md_op_data *op_data,
+		       struct ptlrpc_request **request)
+{
+	struct lmv_obd *lmv = &exp->exp_obd->u.lmv;
+	struct lmv_tgt_desc *tgt;
+
+	ENTRY;
+
+	tgt = lmv_fid2tgt(lmv, &op_data->op_fid1);
+	if (IS_ERR(tgt))
+		RETURN(PTR_ERR(tgt));
+
+	/* caller only wants to know which MDT index holds the object */
+	if (op_data->op_flags & MF_GET_MDT_IDX) {
+		op_data->op_mds = tgt->ltd_index;
+		RETURN(0);
+	}
+
+	RETURN(md_getattr(tgt->ltd_exp, op_data, request));
+}
+
+static int lmv_null_inode(struct obd_export *exp, const struct lu_fid *fid)
+{
+	struct lmv_obd *lmv = &exp->exp_obd->u.lmv;
+	struct lu_tgt_desc *mdt;
+
+	ENTRY;
+
+	CDEBUG(D_INODE, "CBDATA for "DFID"\n", PFID(fid));
+
+	/*
+	 * Under DNE an object may hold locks in two different namespaces:
+	 * the lookup lock on the MDT storing the directory entry and the
+	 * update/open lock on the MDT storing the inode, so broadcast the
+	 * call to every connected target.
+	 */
+	lmv_foreach_connected_tgt(lmv, mdt)
+		md_null_inode(mdt->ltd_exp, fid);
+
+	RETURN(0);
+}
+
+static int lmv_close(struct obd_export *exp, struct md_op_data *op_data,
+		     struct md_open_data *mod, struct ptlrpc_request **request)
+{
+	struct lmv_obd *lmv = &exp->exp_obd->u.lmv;
+	struct lmv_tgt_desc *tgt;
+
+	ENTRY;
+
+	/* close goes to the MDT holding the inode (op_fid1) */
+	tgt = lmv_fid2tgt(lmv, &op_data->op_fid1);
+	if (IS_ERR(tgt))
+		RETURN(PTR_ERR(tgt));
+
+	CDEBUG(D_INODE, "CLOSE "DFID"\n", PFID(&op_data->op_fid1));
+
+	RETURN(md_close(tgt->ltd_exp, op_data, mod, request));
+}
+
+/*
+ * Check whether @tgt can take part in QOS allocation: it must be
+ * connected, active and accepting creates, and either its import has
+ * been set up for at least half of the statfs max age (so its statfs
+ * data is meaningful) or the whole LMV device itself was set up
+ * recently.
+ *
+ * Note: look up the import only after validating ltd_exp —
+ * class_exp2cliimp() dereferences its argument, so calling it with a
+ * NULL export (target not yet connected) would oops.
+ */
+static inline bool tgt_qos_is_usable(struct lmv_obd *lmv,
+				     struct lu_tgt_desc *tgt, time64_t now)
+{
+	u32 maxage = lmv->lmv_mdt_descs.ltd_lmv_desc.ld_qos_maxage;
+	struct obd_import *imp;
+
+	if (!tgt->ltd_exp || !tgt->ltd_active)
+		return false;
+
+	if (tgt->ltd_statfs.os_state & OS_STATFS_NOCREATE)
+		return false;
+
+	imp = class_exp2cliimp(tgt->ltd_exp);
+	return now - imp->imp_setup_time > (maxage >> 1) ||
+	       now - lmv->lmv_setup_time < (maxage << 1);
+}
+
+/*
+ * Pick an MDT for a new directory using space-balanced (QOS) selection.
+ *
+ * Per-target weights are derived from free-space statistics under the
+ * QOS rw_sem, then a weighted random draw selects the target.  If the
+ * parent MDT still has above-average free space, the parent is kept to
+ * avoid creating needless remote directories (see depth heuristic
+ * comment below).
+ *
+ * \param[in] lmv	LMV device
+ * \param[in,out] op_data	op_mds identifies the parent MDT on entry
+ *
+ * \retval	target descriptor on success
+ * \retval	ERR_PTR(-EAGAIN) when QOS is unusable/imbalanced — the
+ *		caller is expected to fall back to another policy
+ */
+static struct lu_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv,
+					      struct md_op_data *op_data)
+{
+	struct lu_tgt_desc *tgt, *cur = NULL;
+	time64_t now = ktime_get_seconds();
+	__u64 total_avail = 0;
+	__u64 total_weight = 0;
+	__u64 cur_weight = 0;
+	int total_usable = 0;
+	__u64 rand;
+	int rc;
+
+	ENTRY;
+
+	/* cheap unlocked check first; re-checked under the semaphore */
+	if (!ltd_qos_is_usable(&lmv->lmv_mdt_descs))
+		RETURN(ERR_PTR(-EAGAIN));
+
+	down_write(&lmv->lmv_qos.lq_rw_sem);
+
+	if (!ltd_qos_is_usable(&lmv->lmv_mdt_descs))
+		GOTO(unlock, tgt = ERR_PTR(-EAGAIN));
+
+	rc = ltd_qos_penalties_calc(&lmv->lmv_mdt_descs);
+	if (rc)
+		GOTO(unlock, tgt = ERR_PTR(rc));
+
+	/* mark usable targets, refresh stale statfs, accumulate totals */
+	lmv_foreach_tgt(lmv, tgt) {
+		if (!tgt_qos_is_usable(lmv, tgt, now)) {
+			tgt->ltd_qos.ltq_usable = 0;
+			continue;
+		}
+		/* update one hour overdue statfs */
+		if (now - tgt->ltd_statfs_age >
+		    60 * lmv->lmv_mdt_descs.ltd_lmv_desc.ld_qos_maxage)
+			lmv_statfs_check_update(lmv2obd_dev(lmv), tgt);
+		tgt->ltd_qos.ltq_usable = 1;
+		lu_tgt_qos_weight_calc(tgt, true);
+		/* remember the parent MDT for the stay-local heuristic */
+		if (tgt->ltd_index == op_data->op_mds)
+			cur = tgt;
+		total_avail += tgt->ltd_qos.ltq_avail;
+		total_weight += tgt->ltd_qos.ltq_weight;
+		total_usable++;
+	}
+
+	/* If current MDT has above-average space and dir is not already using
+	 * round-robin to spread across more MDTs, stay on the parent MDT
+	 * to avoid creating needless remote MDT directories. Remote dirs
+	 * close to the root balance space more effectively than bottom dirs,
+	 * so prefer to create remote dirs at top level of directory tree.
+	 * "16 / (dir_depth + 10)" is the factor to make it less likely
+	 * for top-level directories to stay local unless they have more than
+	 * average free space, while deep dirs prefer local until more full.
+	 * depth=0 -> 160%, depth=3 -> 123%, depth=6 -> 100%,
+	 * depth=9 -> 84%, depth=12 -> 73%, depth=15 -> 64%
+	 */
+	if (!lmv_op_default_rr_mkdir(op_data)) {
+		rand = total_avail * 16 /
+		       (total_usable * (op_data->op_dir_depth + 10));
+		if (cur && cur->ltd_qos.ltq_avail >= rand) {
+			tgt = cur;
+			GOTO(unlock, tgt);
+		}
+	}
+
+	/* weighted random pick: first target whose cumulative weight
+	 * reaches the random threshold wins
+	 */
+	rand = lu_prandom_u64_max(total_weight);
+
+	lmv_foreach_connected_tgt(lmv, tgt) {
+		if (!tgt->ltd_qos.ltq_usable)
+			continue;
+
+		cur_weight += tgt->ltd_qos.ltq_weight;
+		if (cur_weight < rand)
+			continue;
+
+		/* rebalance penalties after consuming this target */
+		ltd_qos_update(&lmv->lmv_mdt_descs, tgt, &total_weight);
+		GOTO(unlock, tgt);
+	}
+
+	/* no proper target found */
+	GOTO(unlock, tgt = ERR_PTR(-EAGAIN));
+unlock:
+	up_write(&lmv->lmv_qos.lq_rw_sem);
+
+	return tgt;
+}
+
+/* round-robin pick of the next usable MDT, starting from the saved index */
+static struct lu_tgt_desc *lmv_locate_tgt_rr(struct lmv_obd *lmv)
+{
+	time64_t now = ktime_get_seconds();
+	struct lu_tgt_desc *tgt = NULL;
+	int size;
+	int i;
+
+	ENTRY;
+
+	spin_lock(&lmv->lmv_lock);
+	size = lmv->lmv_mdt_descs.ltd_tgts_size;
+	for (i = 0; i < size; i++) {
+		int index = (lmv->lmv_qos_rr_index + i) % size;
+
+		tgt = lmv_tgt(lmv, index);
+		if (tgt && tgt_qos_is_usable(lmv, tgt, now))
+			break;
+		tgt = NULL;
+	}
+	if (tgt)
+		/* next allocation starts after the target just chosen */
+		lmv->lmv_qos_rr_index = (tgt->ltd_index + 1) % size;
+	spin_unlock(&lmv->lmv_lock);
+
+	RETURN(tgt ? tgt : ERR_PTR(-ENODEV));
+}
+
+/* locate MDT which is less full (avoid the most full MDT) */
+static struct lu_tgt_desc *lmv_locate_tgt_lf(struct lmv_obd *lmv)
+{
+	/* 'min' tracks the target with the SMALLEST ltq_avail, i.e. the
+	 * most full MDT, which is excluded from the random draw below.
+	 */
+	struct lu_tgt_desc *min = NULL;
+	struct lu_tgt_desc *tgt;
+	__u64 avail = 0;
+	__u64 rand;
+
+	ENTRY;
+
+	/* cheap unlocked check first; re-checked under the semaphore */
+	if (!ltd_qos_is_usable(&lmv->lmv_mdt_descs))
+		RETURN(ERR_PTR(-EAGAIN));
+
+	down_write(&lmv->lmv_qos.lq_rw_sem);
+
+	if (!ltd_qos_is_usable(&lmv->lmv_mdt_descs))
+		GOTO(unlock, tgt = ERR_PTR(-EAGAIN));
+
+	/* mark usable targets, total their free space, find the fullest */
+	lmv_foreach_tgt(lmv, tgt) {
+		if (!tgt->ltd_exp || !tgt->ltd_active ||
+		    (tgt->ltd_statfs.os_state & OS_STATFS_NOCREATE)) {
+			tgt->ltd_qos.ltq_usable = 0;
+			continue;
+		}
+
+		tgt->ltd_qos.ltq_usable = 1;
+		lu_tgt_qos_weight_calc(tgt, true);
+		avail += tgt->ltd_qos.ltq_avail;
+		if (!min || min->ltd_qos.ltq_avail > tgt->ltd_qos.ltq_avail)
+			min = tgt;
+	}
+
+	/* avoid the most full MDT */
+	if (min)
+		avail -= min->ltd_qos.ltq_avail;
+
+	/* random draw proportional to free space over remaining targets;
+	 * NB: with a single usable target (== min) the loop finds nothing
+	 * and the function returns -EAGAIN
+	 */
+	rand = lu_prandom_u64_max(avail);
+	avail = 0;
+	lmv_foreach_connected_tgt(lmv, tgt) {
+		if (!tgt->ltd_qos.ltq_usable)
+			continue;
+
+		if (tgt == min)
+			continue;
+
+		avail += tgt->ltd_qos.ltq_avail;
+		if (avail < rand)
+			continue;
+
+		GOTO(unlock, tgt);
+	}
+
+	/* no proper target found */
+	GOTO(unlock, tgt = ERR_PTR(-EAGAIN));
+unlock:
+	up_write(&lmv->lmv_qos.lq_rw_sem);
+
+	RETURN(tgt);
+}
+
+/* locate MDT by file name, for striped directory, the file name hash decides
+ * which stripe its dirent is stored.
+ *
+ * On success *fid/*mds are updated to the stripe's FID and MDT index;
+ * for plain directories (or empty name) the MDT of @fid itself is used.
+ */
+static struct lmv_tgt_desc *
+lmv_locate_tgt_by_name(struct lmv_obd *lmv, struct lmv_stripe_object *lso,
+		       const char *name, int namelen, struct lu_fid *fid,
+		       __u32 *mds, bool new_layout)
+{
+	struct lmv_tgt_desc *tgt;
+	const struct lmv_oinfo *oinfo;
+
+	/* plain directory, or no name to hash: target is decided by @fid */
+	if (!lmv_dir_striped(lso) || !namelen) {
+		tgt = lmv_fid2tgt(lmv, fid);
+		if (IS_ERR(tgt))
+			return tgt;
+
+		*mds = tgt->ltd_index;
+		return tgt;
+	}
+
+	/* LFSCK fault injection: force a specific (possibly wrong) stripe */
+	if (CFS_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_NAME_HASH)) {
+		if (cfs_fail_val >= lso->lso_lsm.lsm_md_stripe_count)
+			return ERR_PTR(-EBADF);
+		oinfo = &lso->lso_lsm.lsm_md_oinfo[cfs_fail_val];
+	} else {
+		oinfo = lsm_name_to_stripe_info(lso, name, namelen, new_layout);
+		if (IS_ERR(oinfo))
+			return ERR_CAST(oinfo);
+	}
+
+	/* check stripe FID is sane */
+	if (!fid_is_sane(&oinfo->lmo_fid))
+		return ERR_PTR(-ENODEV);
+
+	*fid = oinfo->lmo_fid;
+	*mds = oinfo->lmo_mds;
+	tgt = lmv_tgt_retry(lmv, oinfo->lmo_mds);
+
+	CDEBUG(D_INODE, "locate MDT %u parent "DFID"\n", *mds, PFID(fid));
+
+	return tgt ? tgt : ERR_PTR(-ENODEV);
+}
+
+/**
+ * Locate MDT of op_data->op_fid1
+ *
+ * For striped directory, it will locate the stripe by name hash, if hash_type
+ * is unknown, it will return the stripe specified by 'op_data->op_stripe_index'
+ * which is set outside, and if dir is migrating, 'op_data->op_new_layout'
+ * indicates whether old or new layout is used to locate.
+ *
+ * For plain directory, it just locate the MDT of op_data->op_fid1.
+ *
+ * \param[in] lmv		LMV device
+ * \param[in/out] op_data	client MD stack parameters, name, namelen etc,
+ *				op_mds and op_fid1 will be updated if op_lso1
+ *				indicates fid1 represents a striped directory.
+ *
+ * retval		pointer to the lmv_tgt_desc if succeed.
+ *			ERR_PTR(errno) if failed.
+ */
+struct lmv_tgt_desc *
+lmv_locate_tgt(struct lmv_obd *lmv, struct md_op_data *op_data)
+{
+	struct lmv_stripe_md *lsm;
+	struct lmv_oinfo *oinfo;
+	struct lmv_tgt_desc *tgt;
+
+	/* foreign layout carries no stripe info to locate with */
+	if (lmv_dir_foreign(op_data->op_lso1))
+		return ERR_PTR(-ENODATA);
+
+	/* During creating VOLATILE file, it should honor the mdt
+	 * index if the file under striped dir is being restored, see
+	 * ct_restore().
+	 */
+	if (op_data->op_bias & MDS_CREATE_VOLATILE &&
+	    op_data->op_mds != LMV_OFFSET_DEFAULT) {
+		tgt = lmv_tgt(lmv, op_data->op_mds);
+		if (!tgt)
+			return ERR_PTR(-ENODEV);
+
+		if (lmv_dir_striped(op_data->op_lso1)) {
+			int i;
+
+			/* refill the right parent fid */
+			lsm = &op_data->op_lso1->lso_lsm;
+			for (i = 0; i < lsm->lsm_md_stripe_count; i++) {
+				oinfo = &lsm->lsm_md_oinfo[i];
+				if (oinfo->lmo_mds == op_data->op_mds) {
+					op_data->op_fid1 = oinfo->lmo_fid;
+					break;
+				}
+			}
+
+			/* no stripe on the requested MDT: fall back to the
+			 * first (master) stripe
+			 */
+			if (i == lsm->lsm_md_stripe_count)
+				op_data->op_fid1 = lsm->lsm_md_oinfo[0].lmo_fid;
+		}
+	} else if (lmv_dir_bad_hash(op_data->op_lso1)) {
+		/* hash type unknown/broken: use the stripe index chosen by
+		 * the caller rather than hashing the name
+		 */
+		lsm = &op_data->op_lso1->lso_lsm;
+
+		LASSERT(op_data->op_stripe_index < lsm->lsm_md_stripe_count);
+		oinfo = &lsm->lsm_md_oinfo[op_data->op_stripe_index];
+
+		op_data->op_fid1 = oinfo->lmo_fid;
+		op_data->op_mds = oinfo->lmo_mds;
+		tgt = lmv_tgt(lmv, oinfo->lmo_mds);
+		if (!tgt)
+			return ERR_PTR(-ENODEV);
+	} else {
+		tgt = lmv_locate_tgt_by_name(lmv, op_data->op_lso1,
+				op_data->op_name, op_data->op_namelen,
+				&op_data->op_fid1, &op_data->op_mds,
+				op_data->op_new_layout);
+	}
+
+	return tgt;
+}
+
+/* Locate MDT of op_data->op_fid2 for link/rename
+ *
+ * If the target directory is migrating, first verify the name does not
+ * already exist under the OLD layout (returns -EEXIST if it does), then
+ * locate the target stripe under the NEW layout.  op_fid2/op_mds are
+ * updated to the chosen stripe.
+ */
+static struct lmv_tgt_desc *
+lmv_locate_tgt2(struct lmv_obd *lmv, struct md_op_data *op_data)
+{
+	struct lmv_tgt_desc *tgt;
+	int rc;
+
+	LASSERT(op_data->op_name);
+	if (lmv_dir_layout_changing(op_data->op_lso2)) {
+		/* save fid1/lso1; they are temporarily repointed at the
+		 * target dir for the getattr-by-name probe below
+		 */
+		struct lu_fid fid1 = op_data->op_fid1;
+		struct lmv_stripe_object *lso1 = op_data->op_lso1;
+		struct ptlrpc_request *request = NULL;
+
+		/*
+		 * avoid creating new file under old layout of migrating
+		 * directory, check it here.
+		 */
+		tgt = lmv_locate_tgt_by_name(lmv, op_data->op_lso2,
+				op_data->op_name, op_data->op_namelen,
+				&op_data->op_fid2, &op_data->op_mds, false);
+		if (IS_ERR(tgt))
+			RETURN(tgt);
+
+		op_data->op_fid1 = op_data->op_fid2;
+		op_data->op_lso1 = op_data->op_lso2;
+		rc = md_getattr_name(tgt->ltd_exp, op_data, &request);
+		op_data->op_fid1 = fid1;
+		op_data->op_lso1 = lso1;
+		if (!rc) {
+			/* name exists under the old layout */
+			ptlrpc_req_put(request);
+			RETURN(ERR_PTR(-EEXIST));
+		}
+
+		/* only "not found" lets the create proceed */
+		if (rc != -ENOENT)
+			RETURN(ERR_PTR(rc));
+	}
+
+	return lmv_locate_tgt_by_name(lmv, op_data->op_lso2,
+				op_data->op_name, op_data->op_namelen,
+				&op_data->op_fid2, &op_data->op_mds,
+				true);
+}
+
+int lmv_old_layout_lookup(struct lmv_obd *lmv, struct md_op_data *op_data)
+{
+ struct lu_tgt_desc *tgt;
+ struct ptlrpc_request *request;
+ int rc;
+
+ LASSERT(lmv_dir_layout_changing(op_data->op_lso1));
+ LASSERT(!op_data->op_new_layout);
+
+ tgt = lmv_locate_tgt(lmv, op_data);
+ if (IS_ERR(tgt))
+ return PTR_ERR(tgt);
+
+ rc = md_getattr_name(tgt->ltd_exp, op_data, &request);
+ if (!rc) {
+ ptlrpc_req_put(request);
+ return -EEXIST;
+ }