X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Flmv%2Flmv_obd.c;h=4cd844fffdfa3b542ee0606ec8d8cb3eb202a78c;hb=5e6a30cc2f344e38dd11ef3db1eb23c1705d8f32;hp=b04d5a3ea5bd65b7da8ac07d9968f9bc0b7fa030;hpb=6d21fbbf018b697c9a42977508fac57d9e476877;p=fs%2Flustre-release.git diff --git a/lustre/lmv/lmv_obd.c b/lustre/lmv/lmv_obd.c index b04d5a3..4cd844f 100644 --- a/lustre/lmv/lmv_obd.c +++ b/lustre/lmv/lmv_obd.c @@ -60,9 +60,8 @@ static int lmv_check_connect(struct obd_device *obd); -static void lmv_activate_target(struct lmv_obd *lmv, - struct lmv_tgt_desc *tgt, - int activate) +void lmv_activate_target(struct lmv_obd *lmv, struct lmv_tgt_desc *tgt, + int activate) { if (tgt->ltd_active == activate) return; @@ -294,21 +293,21 @@ int lmv_connect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt) int rc; ENTRY; - mdc_obd = class_find_client_obd(&tgt->ltd_uuid, LUSTRE_MDC_NAME, - &obd->obd_uuid); - if (!mdc_obd) { - CERROR("target %s not attached\n", tgt->ltd_uuid.uuid); - RETURN(-EINVAL); - } + mdc_obd = class_find_client_obd(&tgt->ltd_uuid, LUSTRE_MDC_NAME, + &obd->obd_uuid); + if (!mdc_obd) { + CERROR("target %s not attached\n", tgt->ltd_uuid.uuid); + RETURN(-EINVAL); + } CDEBUG(D_CONFIG, "connect to %s(%s) - %s, %s\n", mdc_obd->obd_name, mdc_obd->obd_uuid.uuid, tgt->ltd_uuid.uuid, obd->obd_uuid.uuid); - if (!mdc_obd->obd_set_up) { - CERROR("target %s is not set up\n", tgt->ltd_uuid.uuid); - RETURN(-EINVAL); - } + if (!mdc_obd->obd_set_up) { + CERROR("target %s is not set up\n", tgt->ltd_uuid.uuid); + RETURN(-EINVAL); + } rc = obd_connect(NULL, &mdc_exp, mdc_obd, &obd->obd_uuid, &lmv->conn_data, lmv->lmv_cache); @@ -324,19 +323,19 @@ int lmv_connect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt) if (rc) RETURN(rc); - target.ft_srv = NULL; - target.ft_exp = mdc_exp; - target.ft_idx = tgt->ltd_idx; + target.ft_srv = NULL; + target.ft_exp = mdc_exp; + target.ft_idx = tgt->ltd_index; - fld_client_add_target(&lmv->lmv_fld, &target); + fld_client_add_target(&lmv->lmv_fld, &target); - rc = obd_register_observer(mdc_obd, obd); - if (rc) { - obd_disconnect(mdc_exp); - CERROR("target %s register_observer error %d\n", - tgt->ltd_uuid.uuid, rc); - RETURN(rc); - } + rc = obd_register_observer(mdc_obd, obd); + if (rc) { + obd_disconnect(mdc_exp); + CERROR("target %s register_observer error %d\n", + tgt->ltd_uuid.uuid, rc); + RETURN(rc); + } if (obd->obd_observer) { /* @@ -356,10 +355,18 @@ int lmv_connect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt) md_init_ea_size(tgt->ltd_exp, lmv->max_easize, lmv->max_def_easize); + rc = lqos_add_tgt(&lmv->lmv_qos, tgt); + if (rc) { + obd_disconnect(mdc_exp); + RETURN(rc); + } + CDEBUG(D_CONFIG, "Connected to %s(%s) successfully (%d)\n", mdc_obd->obd_name, mdc_obd->obd_uuid.uuid, atomic_read(&obd->obd_refcount)); + lmv_statfs_check_update(obd, tgt); + if (lmv->lmv_tgts_kobj) /* Even if we failed to create the link, that's fine */ rc = sysfs_create_link(lmv->lmv_tgts_kobj, @@ -373,6 +380,8 @@ static void lmv_del_target(struct lmv_obd *lmv, int index) if (lmv->tgts[index] == NULL) return; + lqos_del_tgt(&lmv->lmv_qos, lmv->tgts[index]); + OBD_FREE_PTR(lmv->tgts[index]); lmv->tgts[index] = NULL; return; @@ -382,11 +391,12 @@ static int lmv_add_target(struct obd_device *obd, struct obd_uuid *uuidp, __u32 index, int gen) { struct obd_device *mdc_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt; - int orig_tgt_count = 0; - int rc = 0; - ENTRY; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int orig_tgt_count = 0; + int rc = 0; + + ENTRY; CDEBUG(D_CONFIG, "Target uuid: %s. index %d\n", uuidp->uuid, index); mdc_obd = class_find_client_obd(uuidp, LUSTRE_MDC_NAME, @@ -445,7 +455,7 @@ static int lmv_add_target(struct obd_device *obd, struct obd_uuid *uuidp, } mutex_init(&tgt->ltd_fid_mutex); - tgt->ltd_idx = index; + tgt->ltd_index = index; tgt->ltd_uuid = *uuidp; tgt->ltd_active = 0; lmv->tgts[index] = tgt; @@ -660,6 +670,7 @@ repeat_fid2path: if (remote_gf != NULL) { struct getinfo_fid2path *ori_gf; char *ptr; + int len; ori_gf = (struct getinfo_fid2path *)karg; if (strlen(ori_gf->gf_u.gf_path) + 1 + @@ -668,13 +679,12 @@ repeat_fid2path: ptr = ori_gf->gf_u.gf_path; - memmove(ptr + strlen(gf->gf_u.gf_path) + 1, ptr, - strlen(ori_gf->gf_u.gf_path)); - - strncpy(ptr, gf->gf_u.gf_path, - strlen(gf->gf_u.gf_path)); - ptr += strlen(gf->gf_u.gf_path); - *ptr = '/'; + len = strlen(gf->gf_u.gf_path); + /* move the current path to the right to release space + * for closer-to-root part */ + memmove(ptr + len + 1, ptr, strlen(ori_gf->gf_u.gf_path)); + memcpy(ptr, gf->gf_u.gf_path, len); + ptr[len] = '/'; } CDEBUG(D_INFO, "%s: get path %s "DFID" rec: %llu ln: %u\n", @@ -1109,7 +1119,7 @@ hsm_req_err: RETURN(-EINVAL); /* only files on same MDT can have their layouts swapped */ - if (tgt1->ltd_idx != tgt2->ltd_idx) + if (tgt1->ltd_index != tgt2->ltd_index) RETURN(-EPERM); rc = obd_iocontrol(cmd, tgt1->ltd_exp, len, karg, uarg); @@ -1156,41 +1166,52 @@ hsm_req_err: /** * This is _inode_ placement policy function (not name). */ -static int lmv_placement_policy(struct obd_device *obd, - struct md_op_data *op_data, u32 *mds) +static u32 lmv_placement_policy(struct obd_device *obd, + struct md_op_data *op_data) { - struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_obd *lmv = &obd->u.lmv; struct lmv_user_md *lum; + u32 mdt; ENTRY; - LASSERT(mds != NULL); - - if (lmv->desc.ld_tgt_count == 1) { - *mds = 0; + if (lmv->desc.ld_tgt_count == 1) RETURN(0); - } lum = op_data->op_data; - /* Choose MDS by + /* + * Choose MDT by * 1. See if the stripe offset is specified by lum. - * 2. Then check if there is default stripe offset. - * 3. Finally choose MDS by name hash if the parent - * is striped directory. (see lmv_locate_tgt()). */ + * 2. If parent has default LMV, and its hash type is "space", choose + * MDT with QoS. (see lmv_locate_tgt_qos()). + * 3. Then check if default LMV stripe offset is not -1. + * 4. Finally choose MDS by name hash if the parent + * is striped directory. (see lmv_locate_tgt()). + * + * presently explicit MDT location is not supported + * for foreign dirs (as it can't be embedded into free + * format LMV, like with lum_stripe_offset), so we only + * rely on default stripe offset or then name hashing. + */ if (op_data->op_cli_flags & CLI_SET_MEA && lum != NULL && + le32_to_cpu(lum->lum_magic != LMV_MAGIC_FOREIGN) && le32_to_cpu(lum->lum_stripe_offset) != (__u32)-1) { - *mds = le32_to_cpu(lum->lum_stripe_offset); - } else if (op_data->op_default_stripe_offset != (__u32)-1) { - *mds = op_data->op_default_stripe_offset; - op_data->op_mds = *mds; - /* Correct the stripe offset in lum */ - if (lum != NULL) - lum->lum_stripe_offset = cpu_to_le32(*mds); + mdt = le32_to_cpu(lum->lum_stripe_offset); + } else if (op_data->op_code == LUSTRE_OPC_MKDIR && + !lmv_dir_striped(op_data->op_mea1) && + lmv_dir_qos_mkdir(op_data->op_default_mea1)) { + mdt = op_data->op_mds; + } else if (op_data->op_code == LUSTRE_OPC_MKDIR && + op_data->op_default_mea1 && + op_data->op_default_mea1->lsm_md_master_mdt_index != + (__u32)-1) { + mdt = op_data->op_default_mea1->lsm_md_master_mdt_index; + op_data->op_mds = mdt; } else { - *mds = op_data->op_mds; + mdt = op_data->op_mds; } - RETURN(0); + RETURN(mdt); } int __lmv_fid_alloc(struct lmv_obd *lmv, struct lu_fid *fid, u32 mds) @@ -1221,45 +1242,42 @@ int __lmv_fid_alloc(struct lmv_obd *lmv, struct lu_fid *fid, u32 mds) rc = 0; } - EXIT; + EXIT; out: mutex_unlock(&tgt->ltd_fid_mutex); - return rc; + return rc; } int lmv_fid_alloc(const struct lu_env *env, struct obd_export *exp, struct lu_fid *fid, struct md_op_data *op_data) { - struct obd_device *obd = class_exp2obd(exp); - struct lmv_obd *lmv = &obd->u.lmv; - u32 mds = 0; - int rc; - ENTRY; + struct obd_device *obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + u32 mds; + int rc; - LASSERT(op_data != NULL); - LASSERT(fid != NULL); + ENTRY; - rc = lmv_placement_policy(obd, op_data, &mds); - if (rc) { - CERROR("Can't get target for allocating fid, " - "rc %d\n", rc); - RETURN(rc); - } + LASSERT(op_data != NULL); + LASSERT(fid != NULL); - rc = __lmv_fid_alloc(lmv, fid, mds); - if (rc) { - CERROR("Can't alloc new fid, rc %d\n", rc); - RETURN(rc); - } + mds = lmv_placement_policy(obd, op_data); - RETURN(rc); + rc = __lmv_fid_alloc(lmv, fid, mds); + if (rc) + CERROR("Can't alloc new fid, rc %d\n", rc); + + RETURN(rc); } static int lmv_setup(struct obd_device *obd, struct lustre_cfg *lcfg) { - struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_obd *lmv = &obd->u.lmv; struct lmv_desc *desc; - int rc; + struct lnet_process_id lnet_id; + int i = 0; + int rc; + ENTRY; if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) { @@ -1282,12 +1300,36 @@ static int lmv_setup(struct obd_device *obd, struct lustre_cfg *lcfg) obd_str2uuid(&lmv->desc.ld_uuid, desc->ld_uuid.uuid); lmv->desc.ld_tgt_count = 0; lmv->desc.ld_active_tgt_count = 0; + lmv->desc.ld_qos_maxage = LMV_DESC_QOS_MAXAGE_DEFAULT; lmv->max_def_easize = 0; lmv->max_easize = 0; spin_lock_init(&lmv->lmv_lock); mutex_init(&lmv->lmv_init_mutex); + /* Set up allocation policy (QoS and RR) */ + INIT_LIST_HEAD(&lmv->lmv_qos.lq_svr_list); + init_rwsem(&lmv->lmv_qos.lq_rw_sem); + lmv->lmv_qos.lq_dirty = 1; + lmv->lmv_qos.lq_reset = 1; + /* Default priority is toward free space balance */ + lmv->lmv_qos.lq_prio_free = 232; + /* Default threshold for rr (roughly 17%) */ + lmv->lmv_qos.lq_threshold_rr = 43; + + lu_qos_rr_init(&lmv->lmv_qos.lq_rr); + + /* + * initialize rr_index to lower 32bit of netid, so that client + * can distribute subdirs evenly from the beginning. + */ + while (LNetGetId(i++, &lnet_id) != -ENOENT) { + if (LNET_NETTYP(LNET_NIDNET(lnet_id.nid)) != LOLND) { + lmv->lmv_qos_rr_index = (u32)lnet_id.nid; + break; + } + } + rc = lmv_tunables_init(obd); if (rc) CWARN("%s: error adding LMV sysfs/debugfs files: rc = %d\n", @@ -1447,6 +1489,48 @@ out_free_temp: return rc; } +static int lmv_statfs_update(void *cookie, int rc) +{ + struct obd_info *oinfo = cookie; + struct obd_device *obd = oinfo->oi_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt = oinfo->oi_tgt; + struct obd_statfs *osfs = oinfo->oi_osfs; + + /* + * NB: don't deactivate TGT upon error, because we may not trigger async + * statfs any longer, then there is no chance to activate TGT. + */ + if (!rc) { + spin_lock(&lmv->lmv_lock); + tgt->ltd_statfs = *osfs; + tgt->ltd_statfs_age = ktime_get_seconds(); + spin_unlock(&lmv->lmv_lock); + lmv->lmv_qos.lq_dirty = 1; + } + + return rc; +} + +/* update tgt statfs async if it's ld_qos_maxage old */ +int lmv_statfs_check_update(struct obd_device *obd, struct lmv_tgt_desc *tgt) +{ + struct obd_info oinfo = { + .oi_obd = obd, + .oi_tgt = tgt, + .oi_cb_up = lmv_statfs_update, + }; + int rc; + + if (ktime_get_seconds() - tgt->ltd_statfs_age < + obd->u.lmv.desc.ld_qos_maxage) + return 0; + + rc = obd_statfs_async(tgt->ltd_exp, &oinfo, 0, NULL); + + return rc; +} + static int lmv_get_root(struct obd_export *exp, const char *fileset, struct lu_fid *fid) { @@ -1514,7 +1598,7 @@ static int lmv_getattr(struct obd_export *exp, struct md_op_data *op_data, RETURN(PTR_ERR(tgt)); if (op_data->op_flags & MF_GET_MDT_IDX) { - op_data->op_mds = tgt->ltd_idx; + op_data->op_mds = tgt->ltd_index; RETURN(0); } @@ -1564,21 +1648,20 @@ static int lmv_close(struct obd_export *exp, struct md_op_data *op_data, RETURN(rc); } -struct lmv_tgt_desc* -__lmv_locate_tgt(struct lmv_obd *lmv, struct lmv_stripe_md *lsm, - const char *name, int namelen, struct lu_fid *fid, u32 *mds, - bool post_migrate) +static struct lmv_tgt_desc * +lmv_locate_tgt_by_name(struct lmv_obd *lmv, struct lmv_stripe_md *lsm, + const char *name, int namelen, struct lu_fid *fid, + __u32 *mds, bool post_migrate) { struct lmv_tgt_desc *tgt; const struct lmv_oinfo *oinfo; - if (lsm == NULL || namelen == 0) { + if (!lmv_dir_striped(lsm) || !namelen) { tgt = lmv_find_target(lmv, fid); if (IS_ERR(tgt)) return tgt; - LASSERT(mds); - *mds = tgt->ltd_idx; + *mds = tgt->ltd_index; return tgt; } @@ -1593,46 +1676,43 @@ __lmv_locate_tgt(struct lmv_obd *lmv, struct lmv_stripe_md *lsm, return ERR_CAST(oinfo); } - if (fid != NULL) - *fid = oinfo->lmo_fid; - if (mds != NULL) - *mds = oinfo->lmo_mds; - + *fid = oinfo->lmo_fid; + *mds = oinfo->lmo_mds; tgt = lmv_get_target(lmv, oinfo->lmo_mds, NULL); - CDEBUG(D_INFO, "locate on mds %u "DFID"\n", oinfo->lmo_mds, - PFID(&oinfo->lmo_fid)); + CDEBUG(D_INODE, "locate MDT %u parent "DFID"\n", *mds, PFID(fid)); return tgt; } - /** - * Locate mdt by fid or name + * Locate MDT of op_data->op_fid1 * * For striped directory, it will locate the stripe by name hash, if hash_type * is unknown, it will return the stripe specified by 'op_data->op_stripe_index' * which is set outside, and if dir is migrating, 'op_data->op_post_migrate' * indicates whether old or new layout is used to locate. * - * For normal direcotry, it will locate MDS by FID directly. + * For plain direcotry, normally it will locate MDT by FID, but if this + * directory has default LMV, and its hash type is "space", locate MDT with QoS. * * \param[in] lmv LMV device * \param[in] op_data client MD stack parameters, name, namelen * mds_num etc. - * \param[in] fid object FID used to locate MDS. * * retval pointer to the lmv_tgt_desc if succeed. * ERR_PTR(errno) if failed. */ -struct lmv_tgt_desc* -lmv_locate_tgt(struct lmv_obd *lmv, struct md_op_data *op_data, - struct lu_fid *fid) +struct lmv_tgt_desc * +lmv_locate_tgt(struct lmv_obd *lmv, struct md_op_data *op_data) { struct lmv_stripe_md *lsm = op_data->op_mea1; struct lmv_oinfo *oinfo; struct lmv_tgt_desc *tgt; + if (lmv_dir_foreign(lsm)) + return ERR_PTR(-ENODATA); + /* During creating VOLATILE file, it should honor the mdt * index if the file under striped dir is being restored, see * ct_restore(). */ @@ -1642,62 +1722,128 @@ lmv_locate_tgt(struct lmv_obd *lmv, struct md_op_data *op_data, if (IS_ERR(tgt)) return tgt; - if (lsm) { + if (lmv_dir_striped(lsm)) { int i; /* refill the right parent fid */ for (i = 0; i < lsm->lsm_md_stripe_count; i++) { oinfo = &lsm->lsm_md_oinfo[i]; if (oinfo->lmo_mds == op_data->op_mds) { - *fid = oinfo->lmo_fid; + op_data->op_fid1 = oinfo->lmo_fid; break; } } if (i == lsm->lsm_md_stripe_count) - *fid = lsm->lsm_md_oinfo[0].lmo_fid; + op_data->op_fid1 = lsm->lsm_md_oinfo[0].lmo_fid; } - } else if (lmv_is_dir_bad_hash(lsm)) { + } else if (lmv_dir_bad_hash(lsm)) { LASSERT(op_data->op_stripe_index < lsm->lsm_md_stripe_count); oinfo = &lsm->lsm_md_oinfo[op_data->op_stripe_index]; - *fid = oinfo->lmo_fid; + op_data->op_fid1 = oinfo->lmo_fid; op_data->op_mds = oinfo->lmo_mds; tgt = lmv_get_target(lmv, oinfo->lmo_mds, NULL); + } else if (op_data->op_code == LUSTRE_OPC_MKDIR && + lmv_dir_qos_mkdir(op_data->op_default_mea1) && + !lmv_dir_striped(lsm)) { + tgt = lmv_locate_tgt_qos(lmv, &op_data->op_mds); + if (tgt == ERR_PTR(-EAGAIN)) + tgt = lmv_locate_tgt_rr(lmv, &op_data->op_mds); + /* + * only update statfs when mkdir under dir with "space" hash, + * this means the cached statfs may be stale, and current mkdir + * may not follow QoS accurately, but it's not serious, and it + * avoids periodic statfs when client doesn't mkdir under + * "space" hashed directories. + * + * TODO: after MDT support QoS object allocation, also update + * statfs for 'lfs mkdir -i -1 ...", currently it's done in user + * space. + */ + if (!IS_ERR(tgt)) { + struct obd_device *obd; + + obd = container_of(lmv, struct obd_device, u.lmv); + lmv_statfs_check_update(obd, tgt); + } } else { - tgt = __lmv_locate_tgt(lmv, lsm, op_data->op_name, - op_data->op_namelen, fid, - &op_data->op_mds, - op_data->op_post_migrate); + tgt = lmv_locate_tgt_by_name(lmv, op_data->op_mea1, + op_data->op_name, op_data->op_namelen, + &op_data->op_fid1, &op_data->op_mds, + op_data->op_post_migrate); } return tgt; } +/* Locate MDT of op_data->op_fid2 for link/rename */ +static struct lmv_tgt_desc * +lmv_locate_tgt2(struct lmv_obd *lmv, struct md_op_data *op_data) +{ + struct lmv_tgt_desc *tgt; + int rc; + + LASSERT(op_data->op_name); + if (lmv_dir_migrating(op_data->op_mea2)) { + struct lu_fid fid1 = op_data->op_fid1; + struct lmv_stripe_md *lsm1 = op_data->op_mea1; + struct ptlrpc_request *request = NULL; + + /* + * avoid creating new file under old layout of migrating + * directory, check it here. + */ + tgt = lmv_locate_tgt_by_name(lmv, op_data->op_mea2, + op_data->op_name, op_data->op_namelen, + &op_data->op_fid2, &op_data->op_mds, false); + if (IS_ERR(tgt)) + RETURN(tgt); + + op_data->op_fid1 = op_data->op_fid2; + op_data->op_mea1 = op_data->op_mea2; + rc = md_getattr_name(tgt->ltd_exp, op_data, &request); + op_data->op_fid1 = fid1; + op_data->op_mea1 = lsm1; + if (!rc) { + ptlrpc_req_finished(request); + RETURN(ERR_PTR(-EEXIST)); + } + + if (rc != -ENOENT) + RETURN(ERR_PTR(rc)); + } + + return lmv_locate_tgt_by_name(lmv, op_data->op_mea2, op_data->op_name, + op_data->op_namelen, &op_data->op_fid2, + &op_data->op_mds, true); +} + int lmv_create(struct obd_export *exp, struct md_op_data *op_data, const void *data, size_t datalen, umode_t mode, uid_t uid, gid_t gid, cfs_cap_t cap_effective, __u64 rdev, struct ptlrpc_request **request) { - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt; - int rc; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + ENTRY; if (!lmv->desc.ld_active_tgt_count) RETURN(-EIO); - if (lmv_is_dir_bad_hash(op_data->op_mea1)) + if (lmv_dir_bad_hash(op_data->op_mea1)) RETURN(-EBADF); - if (lmv_is_dir_migrating(op_data->op_mea1)) { + if (lmv_dir_migrating(op_data->op_mea1)) { /* * if parent is migrating, create() needs to lookup existing * name, to avoid creating new file under old layout of * migrating directory, check old layout here. */ - tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); + tgt = lmv_locate_tgt(lmv, op_data); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); @@ -1714,7 +1860,7 @@ int lmv_create(struct obd_export *exp, struct md_op_data *op_data, op_data->op_post_migrate = true; } - tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); + tgt = lmv_locate_tgt(lmv, op_data); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); @@ -1733,9 +1879,7 @@ int lmv_create(struct obd_export *exp, struct md_op_data *op_data, if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); - op_data->op_mds = tgt->ltd_idx; - } else { - CDEBUG(D_CONFIG, "Server doesn't support striped dirs\n"); + op_data->op_mds = tgt->ltd_index; } CDEBUG(D_INODE, "CREATE obj "DFID" -> mds #%x\n", @@ -1770,7 +1914,7 @@ lmv_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo, RETURN(PTR_ERR(tgt)); CDEBUG(D_INODE, "ENQUEUE on "DFID" -> mds #%u\n", - PFID(&op_data->op_fid1), tgt->ltd_idx); + PFID(&op_data->op_fid1), tgt->ltd_index); rc = md_enqueue(tgt->ltd_exp, einfo, policy, op_data, lockh, extra_lock_flags); @@ -1791,13 +1935,13 @@ lmv_getattr_name(struct obd_export *exp,struct md_op_data *op_data, ENTRY; retry: - tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); + tgt = lmv_locate_tgt(lmv, op_data); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); CDEBUG(D_INODE, "GETATTR_NAME for %*s on "DFID" -> mds #%d\n", (int)op_data->op_namelen, op_data->op_name, - PFID(&op_data->op_fid1), tgt->ltd_idx); + PFID(&op_data->op_fid1), tgt->ltd_index); rc = md_getattr_name(tgt->ltd_exp, op_data, preq); if (rc == -ENOENT && lmv_dir_retry_check_update(op_data)) { @@ -1853,7 +1997,7 @@ static int lmv_early_cancel(struct obd_export *exp, struct lmv_tgt_desc *tgt, RETURN(PTR_ERR(tgt)); } - if (tgt->ltd_idx != op_tgt) { + if (tgt->ltd_index != op_tgt) { CDEBUG(D_INODE, "EARLY_CANCEL on "DFID"\n", PFID(fid)); policy.l_inodebits.bits = bits; rc = md_cancel_unused(tgt->ltd_exp, fid, &policy, @@ -1892,39 +2036,7 @@ static int lmv_link(struct obd_export *exp, struct md_op_data *op_data, op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); op_data->op_cap = cfs_curproc_cap_pack(); - if (lmv_is_dir_migrating(op_data->op_mea2)) { - struct lu_fid fid1 = op_data->op_fid1; - struct lmv_stripe_md *lsm1 = op_data->op_mea1; - - /* - * avoid creating new file under old layout of migrating - * directory, check it here. - */ - tgt = __lmv_locate_tgt(lmv, op_data->op_mea2, op_data->op_name, - op_data->op_namelen, &op_data->op_fid2, - &op_data->op_mds, false); - tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); - if (IS_ERR(tgt)) - RETURN(PTR_ERR(tgt)); - - op_data->op_fid1 = op_data->op_fid2; - op_data->op_mea1 = op_data->op_mea2; - rc = md_getattr_name(tgt->ltd_exp, op_data, request); - op_data->op_fid1 = fid1; - op_data->op_mea1 = lsm1; - if (!rc) { - ptlrpc_req_finished(*request); - *request = NULL; - RETURN(-EEXIST); - } - - if (rc != -ENOENT) - RETURN(rc); - } - - tgt = __lmv_locate_tgt(lmv, op_data->op_mea2, op_data->op_name, - op_data->op_namelen, &op_data->op_fid2, - &op_data->op_mds, true); + tgt = lmv_locate_tgt2(lmv, op_data); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); @@ -1932,7 +2044,7 @@ static int lmv_link(struct obd_export *exp, struct md_op_data *op_data, * Cancel UPDATE lock on child (fid1). */ op_data->op_flags |= MF_MDC_CANCEL_FID2; - rc = lmv_early_cancel(exp, NULL, op_data, tgt->ltd_idx, LCK_EX, + rc = lmv_early_cancel(exp, NULL, op_data, tgt->ltd_index, LCK_EX, MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID1); if (rc != 0) RETURN(rc); @@ -1972,7 +2084,7 @@ static int lmv_migrate(struct obd_export *exp, struct md_op_data *op_data, if (IS_ERR(parent_tgt)) RETURN(PTR_ERR(parent_tgt)); - if (lsm) { + if (lmv_dir_striped(lsm)) { __u32 hash_type = lsm->lsm_md_hash_type; __u32 stripe_count = lsm->lsm_md_stripe_count; @@ -1980,7 +2092,7 @@ static int lmv_migrate(struct obd_export *exp, struct md_op_data *op_data, * old stripes are appended after new stripes for migrating * directory. */ - if (lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION) { + if (lmv_dir_migrating(lsm)) { hash_type = lsm->lsm_md_migrate_hash; stripe_count -= lsm->lsm_md_migrate_offset; } @@ -1990,7 +2102,7 @@ static int lmv_migrate(struct obd_export *exp, struct md_op_data *op_data, if (rc < 0) RETURN(rc); - if (lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION) + if (lmv_dir_migrating(lsm)) rc += lsm->lsm_md_migrate_offset; /* save it in fid4 temporarily for early cancel */ @@ -2004,7 +2116,7 @@ static int lmv_migrate(struct obd_export *exp, struct md_op_data *op_data, * if parent is being migrated too, fill op_fid2 with target * stripe fid, otherwise the target stripe is not created yet. */ - if (lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION) { + if (lmv_dir_migrating(lsm)) { hash_type = lsm->lsm_md_hash_type & ~LMV_HASH_FLAG_MIGRATION; stripe_count = lsm->lsm_md_migrate_offset; @@ -2030,7 +2142,7 @@ static int lmv_migrate(struct obd_export *exp, struct md_op_data *op_data, RETURN(PTR_ERR(child_tgt)); if (!S_ISDIR(op_data->op_mode) && tp_tgt) - rc = __lmv_fid_alloc(lmv, &target_fid, tp_tgt->ltd_idx); + rc = __lmv_fid_alloc(lmv, &target_fid, tp_tgt->ltd_index); else rc = lmv_fid_alloc(NULL, exp, &target_fid, op_data); if (rc) @@ -2056,7 +2168,7 @@ static int lmv_migrate(struct obd_export *exp, struct md_op_data *op_data, } /* cancel UPDATE lock of parent master object */ - rc = lmv_early_cancel(exp, parent_tgt, op_data, tgt->ltd_idx, LCK_EX, + rc = lmv_early_cancel(exp, parent_tgt, op_data, tgt->ltd_index, LCK_EX, MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID1); if (rc) RETURN(rc); @@ -2081,14 +2193,14 @@ static int lmv_migrate(struct obd_export *exp, struct md_op_data *op_data, op_data->op_fid4 = target_fid; /* cancel UPDATE locks of target parent */ - rc = lmv_early_cancel(exp, tp_tgt, op_data, tgt->ltd_idx, LCK_EX, + rc = lmv_early_cancel(exp, tp_tgt, op_data, tgt->ltd_index, LCK_EX, MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID2); if (rc) RETURN(rc); /* cancel LOOKUP lock of source if source is remote object */ if (child_tgt != sp_tgt) { - rc = lmv_early_cancel(exp, sp_tgt, op_data, tgt->ltd_idx, + rc = lmv_early_cancel(exp, sp_tgt, op_data, tgt->ltd_index, LCK_EX, MDS_INODELOCK_LOOKUP, MF_MDC_CANCEL_FID3); if (rc) @@ -2096,7 +2208,7 @@ static int lmv_migrate(struct obd_export *exp, struct md_op_data *op_data, } /* cancel ELC locks of source */ - rc = lmv_early_cancel(exp, child_tgt, op_data, tgt->ltd_idx, LCK_EX, + rc = lmv_early_cancel(exp, child_tgt, op_data, tgt->ltd_index, LCK_EX, MDS_INODELOCK_ELC, MF_MDC_CANCEL_FID3); if (rc) RETURN(rc); @@ -2133,44 +2245,10 @@ static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data, op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); op_data->op_cap = cfs_curproc_cap_pack(); - if (lmv_is_dir_migrating(op_data->op_mea2)) { - struct lu_fid fid1 = op_data->op_fid1; - struct lmv_stripe_md *lsm1 = op_data->op_mea1; - - /* - * we avoid creating new file under old layout of migrating - * directory, if there is an existing file with new name under - * old layout, we can't unlink file in old layout and rename to - * new layout in one transaction, so return -EBUSY here.` - */ - tgt = __lmv_locate_tgt(lmv, op_data->op_mea2, new, newlen, - &op_data->op_fid2, &op_data->op_mds, - false); - if (IS_ERR(tgt)) - RETURN(PTR_ERR(tgt)); + op_data->op_name = new; + op_data->op_namelen = newlen; - op_data->op_fid1 = op_data->op_fid2; - op_data->op_mea1 = op_data->op_mea2; - op_data->op_name = new; - op_data->op_namelen = newlen; - rc = md_getattr_name(tgt->ltd_exp, op_data, request); - op_data->op_fid1 = fid1; - op_data->op_mea1 = lsm1; - op_data->op_name = NULL; - op_data->op_namelen = 0; - if (!rc) { - ptlrpc_req_finished(*request); - *request = NULL; - RETURN(-EBUSY); - } - - if (rc != -ENOENT) - RETURN(rc); - } - - /* rename to new layout for migrating directory */ - tp_tgt = __lmv_locate_tgt(lmv, op_data->op_mea2, new, newlen, - &op_data->op_fid2, &op_data->op_mds, true); + tp_tgt = lmv_locate_tgt2(lmv, op_data); if (IS_ERR(tp_tgt)) RETURN(PTR_ERR(tp_tgt)); @@ -2190,7 +2268,7 @@ static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data, op_data->op_flags |= MF_MDC_CANCEL_FID4; /* cancel UPDATE locks of target parent */ - rc = lmv_early_cancel(exp, tp_tgt, op_data, tgt->ltd_idx, LCK_EX, + rc = lmv_early_cancel(exp, tp_tgt, op_data, tgt->ltd_index, LCK_EX, MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID2); if (rc != 0) RETURN(rc); @@ -2199,7 +2277,7 @@ static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data, /* cancel LOOKUP lock of target on target parent */ if (tgt != tp_tgt) { rc = lmv_early_cancel(exp, tp_tgt, op_data, - tgt->ltd_idx, LCK_EX, + tgt->ltd_index, LCK_EX, MDS_INODELOCK_LOOKUP, MF_MDC_CANCEL_FID4); if (rc != 0) @@ -2213,22 +2291,22 @@ static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data, RETURN(PTR_ERR(src_tgt)); /* cancel ELC locks of source */ - rc = lmv_early_cancel(exp, src_tgt, op_data, tgt->ltd_idx, + rc = lmv_early_cancel(exp, src_tgt, op_data, tgt->ltd_index, LCK_EX, MDS_INODELOCK_ELC, MF_MDC_CANCEL_FID3); if (rc != 0) RETURN(rc); } + op_data->op_name = old; + op_data->op_namelen = oldlen; retry: - sp_tgt = __lmv_locate_tgt(lmv, op_data->op_mea1, old, oldlen, - &op_data->op_fid1, &op_data->op_mds, - op_data->op_post_migrate); + sp_tgt = lmv_locate_tgt(lmv, op_data); if (IS_ERR(sp_tgt)) RETURN(PTR_ERR(sp_tgt)); /* cancel UPDATE locks of source parent */ - rc = lmv_early_cancel(exp, sp_tgt, op_data, tgt->ltd_idx, LCK_EX, + rc = lmv_early_cancel(exp, sp_tgt, op_data, tgt->ltd_index, LCK_EX, MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID1); if (rc != 0) RETURN(rc); @@ -2237,7 +2315,7 @@ retry: /* cancel LOOKUP lock of source on source parent */ if (src_tgt != sp_tgt) { rc = lmv_early_cancel(exp, sp_tgt, op_data, - tgt->ltd_idx, LCK_EX, + tgt->ltd_index, LCK_EX, MDS_INODELOCK_LOOKUP, MF_MDC_CANCEL_FID3); if (rc != 0) @@ -2282,7 +2360,7 @@ rename: /* cancel LOOKUP lock of target on target parent */ if (tgt != tp_tgt) { rc = lmv_early_cancel(exp, tp_tgt, op_data, - tgt->ltd_idx, LCK_EX, + tgt->ltd_index, LCK_EX, MDS_INODELOCK_LOOKUP, MF_MDC_CANCEL_FID4); if (rc != 0) @@ -2429,6 +2507,11 @@ static struct lu_dirent *stripe_dirent_load(struct lmv_dir_ctxt *ctxt, } oinfo = &op_data->op_mea1->lsm_md_oinfo[stripe_index]; + if (!oinfo->lmo_root) { + rc = -ENOENT; + break; + } + tgt = lmv_get_target(ctxt->ldc_lmv, oinfo->lmo_mds, NULL); if (IS_ERR(tgt)) { rc = PTR_ERR(tgt); @@ -2688,14 +2771,17 @@ int lmv_read_page(struct obd_export *exp, struct md_op_data *op_data, struct md_callback *cb_op, __u64 offset, struct page **ppage) { - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_stripe_md *lsm = op_data->op_mea1; - struct lmv_tgt_desc *tgt; - int rc; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + ENTRY; - if (unlikely(lsm != NULL)) { + if (unlikely(lmv_dir_foreign(op_data->op_mea1))) + RETURN(-ENODATA); + + if (unlikely(lmv_dir_striped(op_data->op_mea1))) { rc = lmv_striped_read_page(exp, op_data, cb_op, offset, ppage); RETURN(rc); } @@ -2750,7 +2836,7 @@ static int lmv_unlink(struct obd_export *exp, struct md_op_data *op_data, op_data->op_cap = cfs_curproc_cap_pack(); retry: - parent_tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); + parent_tgt = lmv_locate_tgt(lmv, op_data); if (IS_ERR(parent_tgt)) RETURN(PTR_ERR(parent_tgt)); @@ -2772,17 +2858,18 @@ retry: op_data->op_flags |= MF_MDC_CANCEL_FID1 | MF_MDC_CANCEL_FID3; if (parent_tgt != tgt) - rc = lmv_early_cancel(exp, parent_tgt, op_data, tgt->ltd_idx, + rc = lmv_early_cancel(exp, parent_tgt, op_data, tgt->ltd_index, LCK_EX, MDS_INODELOCK_LOOKUP, MF_MDC_CANCEL_FID3); - rc = lmv_early_cancel(exp, NULL, op_data, tgt->ltd_idx, LCK_EX, + rc = lmv_early_cancel(exp, NULL, op_data, tgt->ltd_index, LCK_EX, MDS_INODELOCK_ELC, MF_MDC_CANCEL_FID3); if (rc) RETURN(rc); CDEBUG(D_INODE, "unlink with fid="DFID"/"DFID" -> mds #%u\n", - PFID(&op_data->op_fid1), PFID(&op_data->op_fid2), tgt->ltd_idx); + PFID(&op_data->op_fid1), PFID(&op_data->op_fid2), + tgt->ltd_index); rc = md_unlink(tgt->ltd_exp, op_data, request); if (rc == -ENOENT && lmv_dir_retry_check_update(op_data)) { @@ -2985,10 +3072,22 @@ static int lmv_unpack_md_v1(struct obd_export *exp, struct lmv_stripe_md *lsm, for (i = 0; i < stripe_count; i++) { fid_le_to_cpu(&lsm->lsm_md_oinfo[i].lmo_fid, &lmm1->lmv_stripe_fids[i]); + /* + * set default value -1, so lmv_locate_tgt() knows this stripe + * target is not initialized. + */ + lsm->lsm_md_oinfo[i].lmo_mds = (u32)-1; + if (!fid_is_sane(&lsm->lsm_md_oinfo[i].lmo_fid)) + continue; + rc = lmv_fld_lookup(lmv, &lsm->lsm_md_oinfo[i].lmo_fid, &lsm->lsm_md_oinfo[i].lmo_mds); - if (rc != 0) + if (rc == -ENOENT) + continue; + + if (rc) RETURN(rc); + CDEBUG(D_INFO, "unpack fid #%d "DFID"\n", i, PFID(&lsm->lsm_md_oinfo[i].lmo_fid)); } @@ -2996,6 +3095,18 @@ static int lmv_unpack_md_v1(struct obd_export *exp, struct lmv_stripe_md *lsm, RETURN(rc); } +static inline int lmv_unpack_user_md(struct obd_export *exp, + struct lmv_stripe_md *lsm, + const struct lmv_user_md *lmu) +{ + lsm->lsm_md_magic = le32_to_cpu(lmu->lum_magic); + lsm->lsm_md_stripe_count = le32_to_cpu(lmu->lum_stripe_count); + lsm->lsm_md_master_mdt_index = le32_to_cpu(lmu->lum_stripe_offset); + lsm->lsm_md_hash_type = le32_to_cpu(lmu->lum_hash_type); + + return 0; +} + static int lmv_unpackmd(struct obd_export *exp, struct lmv_stripe_md **lsmp, const union lmv_mds_md *lmm, size_t lmm_size) { @@ -3011,15 +3122,50 @@ static int lmv_unpackmd(struct obd_export *exp, struct lmv_stripe_md **lsmp, /* Free memmd */ if (lsm != NULL && lmm == NULL) { int i; + struct lmv_foreign_md *lfm = (struct lmv_foreign_md *)lsm; - for (i = 0; i < lsm->lsm_md_stripe_count; i++) - iput(lsm->lsm_md_oinfo[i].lmo_root); - lsm_size = lmv_stripe_md_size(lsm->lsm_md_stripe_count); + if (lfm->lfm_magic == LMV_MAGIC_FOREIGN) { + size_t lfm_size; + + lfm_size = lfm->lfm_length + offsetof(typeof(*lfm), + lfm_value[0]); + OBD_FREE_LARGE(lfm, lfm_size); + RETURN(0); + } + + if (lmv_dir_striped(lsm)) { + for (i = 0; i < lsm->lsm_md_stripe_count; i++) { + if (lsm->lsm_md_oinfo[i].lmo_root) + iput(lsm->lsm_md_oinfo[i].lmo_root); + } + lsm_size = lmv_stripe_md_size(lsm->lsm_md_stripe_count); + } else { + lsm_size = lmv_stripe_md_size(0); + } OBD_FREE(lsm, lsm_size); *lsmp = NULL; RETURN(0); } + /* foreign lmv case */ + if (le32_to_cpu(lmm->lmv_magic) == LMV_MAGIC_FOREIGN) { + struct lmv_foreign_md *lfm = (struct lmv_foreign_md *)lsm; + + if (lfm == NULL) { + OBD_ALLOC_LARGE(lfm, lmm_size); + if (lfm == NULL) + RETURN(-ENOMEM); + *lsmp = (struct lmv_stripe_md *)lfm; + } + lfm->lfm_magic = le32_to_cpu(lmm->lmv_foreign_md.lfm_magic); + lfm->lfm_length = le32_to_cpu(lmm->lmv_foreign_md.lfm_length); + lfm->lfm_type = le32_to_cpu(lmm->lmv_foreign_md.lfm_type); + lfm->lfm_flags = le32_to_cpu(lmm->lmv_foreign_md.lfm_flags); + memcpy(&lfm->lfm_value, &lmm->lmv_foreign_md.lfm_value, + lfm->lfm_length); + RETURN(lmm_size); + } + if (le32_to_cpu(lmm->lmv_magic) == LMV_MAGIC_STRIPE) RETURN(-EPERM); @@ -3041,7 +3187,6 @@ static int lmv_unpackmd(struct obd_export *exp, struct lmv_stripe_md **lsmp, */ lsm_size = lmv_stripe_md_size(0); - lsm_size = lmv_stripe_md_size(lmv_mds_md_stripe_count_get(lmm)); if (lsm == NULL) { OBD_ALLOC(lsm, lsm_size); if (lsm == NULL) @@ -3054,6 +3199,9 @@ static int lmv_unpackmd(struct obd_export *exp, struct lmv_stripe_md **lsmp, case LMV_MAGIC_V1: rc = lmv_unpack_md_v1(exp, lsm, &lmm->lmv_md_v1); break; + case LMV_USER_MAGIC: + rc = lmv_unpack_user_md(exp, lsm, &lmm->lmv_user_md); + break; default: CERROR("%s: unrecognized magic %x\n", exp->exp_obd->obd_name, le32_to_cpu(lmm->lmv_magic)); @@ -3180,6 +3328,10 @@ int lmv_free_lustre_md(struct obd_export *exp, struct lustre_md *md) struct lmv_tgt_desc *tgt = lmv->tgts[0]; ENTRY; + if (md->default_lmv) { + lmv_free_memmd(md->default_lmv); + md->default_lmv = NULL; + } if (md->lmv != NULL) { lmv_free_memmd(md->lmv); md->lmv = NULL; @@ -3273,7 +3425,8 @@ int lmv_get_fid_from_lsm(struct obd_export *exp, { const struct lmv_oinfo *oinfo; - LASSERT(lsm != NULL); + LASSERT(lmv_dir_striped(lsm)); + oinfo = lsm_name_to_stripe_info(lsm, name, namelen, false); if (IS_ERR(oinfo)) return PTR_ERR(oinfo); @@ -3344,6 +3497,9 @@ static int lmv_merge_attr(struct obd_export *exp, int rc; int i; + if (!lmv_dir_striped(lsm)) + return 0; + rc = lmv_revalidate_slaves(exp, lsm, cb_blocking, 0); if (rc < 0) return rc; @@ -3351,6 +3507,9 @@ static int lmv_merge_attr(struct obd_export *exp, for (i = 0; i < lsm->lsm_md_stripe_count; i++) { struct inode *inode = lsm->lsm_md_oinfo[i].lmo_root; + if (!inode) + continue; + CDEBUG(D_INFO, "" DFID " size %llu, blocks %llu nlink %u, atime %lld ctime %lld, mtime %lld.\n", PFID(&lsm->lsm_md_oinfo[i].lmo_fid), @@ -3392,6 +3551,7 @@ struct obd_ops lmv_obd_ops = { .o_set_info_async = lmv_set_info_async, .o_notify = lmv_notify, .o_get_uuid = lmv_get_uuid, + .o_fid_alloc = lmv_fid_alloc, .o_iocontrol = lmv_iocontrol, .o_quotactl = lmv_quotactl };