return lu_device_is_cl(o->lo_dev);
}
+/* Generic subset of OSTs */
+struct ost_pool {
+ __u32 *op_array; /* array of indices into
+ * lov_obd->lov_tgts */
+ unsigned int op_count; /* number of OSTs in the array */
+ unsigned int op_size; /* allocated size of op_array */
+ struct rw_semaphore op_rw_sem; /* to protect ost_pool use */
+};
+
+/* round-robin QoS data for LOD/LMV */
+struct lu_qos_rr {
+ spinlock_t lqr_alloc; /* protect allocation index */
+ __u32 lqr_start_idx; /* start index of new inode */
+ __u32 lqr_offset_idx;/* aliasing for start_idx */
+ int lqr_start_count;/* reseed counter */
+ struct ost_pool lqr_pool; /* round-robin optimized list */
+ unsigned long lqr_dirty:1; /* recalc round-robin list */
+};
+
+/* QoS data per MDS/OSS */
+struct lu_svr_qos {
+ struct obd_uuid lsq_uuid; /* ptlrpc's c_remote_uuid */
+ struct list_head lsq_svr_list; /* link to lq_svr_list */
+ __u64 lsq_bavail; /* total bytes avail on svr */
+ __u64 lsq_iavail; /* total inodes avail on svr */
+ __u64 lsq_penalty; /* current penalty */
+ __u64 lsq_penalty_per_obj; /* penalty decrease
+ * every obj */
+ time64_t lsq_used; /* last used time, seconds */
+ __u32 lsq_tgt_count; /* number of tgts on this svr */
+ __u32 lsq_id; /* unique svr id */
+};
+
+/* QoS data per MDT/OST */
+struct lu_tgt_qos {
+ struct lu_svr_qos *ltq_svr; /* svr info */
+ __u64 ltq_penalty; /* current penalty */
+ __u64 ltq_penalty_per_obj; /* penalty decrease
+ * every obj */
+ __u64 ltq_weight; /* net weighting */
+ time64_t ltq_used; /* last used time, seconds */
+ bool ltq_usable:1; /* usable for striping */
+};
+
+/* target descriptor */
+struct lu_tgt_desc {
+ union {
+ struct dt_device *ltd_tgt;
+ struct obd_device *ltd_obd;
+ };
+ struct obd_export *ltd_exp;
+ struct obd_uuid ltd_uuid;
+ __u32 ltd_index;
+ __u32 ltd_gen;
+ struct list_head ltd_kill;
+ struct ptlrpc_thread *ltd_recovery_thread;
+ struct mutex ltd_fid_mutex;
+ struct lu_tgt_qos ltd_qos; /* qos info per target */
+ struct obd_statfs ltd_statfs;
+ time64_t ltd_statfs_age;
+ unsigned long ltd_active:1,/* is this target up for requests */
+ ltd_activate:1,/* should target be activated */
+ ltd_reap:1, /* should this target be deleted */
+ ltd_got_update_log:1, /* Already got update log */
+ ltd_connecting:1; /* target is connecting */
+};
+
+/* QoS data for LOD/LMV */
+struct lu_qos {
+ struct list_head lq_svr_list; /* lu_svr_qos list */
+ struct rw_semaphore lq_rw_sem;
+ __u32 lq_active_svr_count;
+ unsigned int lq_prio_free; /* priority for free space */
+ unsigned int lq_threshold_rr;/* priority for rr */
+ struct lu_qos_rr lq_rr; /* round robin qos data */
+ unsigned long lq_dirty:1, /* recalc qos data */
+ lq_same_space:1,/* the servers all have approx.
+ * the same space avail */
+ lq_reset:1; /* zero current penalties */
+};
+
+int lqos_add_tgt(struct lu_qos *qos, struct lu_tgt_desc *ltd);
+int lqos_del_tgt(struct lu_qos *qos, struct lu_tgt_desc *ltd);
+
/** @} lu */
#endif /* __LUSTRE_LU_OBJECT_H */
/* OBD_STATFS_* flags */
__u64 oi_flags;
struct obd_device *oi_obd;
- struct lmv_tgt_desc *oi_tgt;
+ struct lu_tgt_desc *oi_tgt;
/* statfs data specific for every OSC, if needed at all. */
struct obd_statfs *oi_osfs;
/* An update callback which is called to update some data on upper
__u64 ec_unique;
};
-/* Generic subset of OSTs */
-struct ost_pool {
- __u32 *op_array; /* array of index of
- lov_obd->lov_tgts */
- unsigned int op_count; /* number of OSTs in the array */
- unsigned int op_size; /* allocated size of lp_array */
- struct rw_semaphore op_rw_sem; /* to protect ost_pool use */
-};
-
/* allow statfs data caching for 1 second */
#define OBD_STATFS_CACHE_SECONDS 1
-struct lov_tgt_desc {
- struct list_head ltd_kill;
- struct obd_uuid ltd_uuid;
- struct obd_device *ltd_obd;
- struct obd_export *ltd_exp;
- __u32 ltd_gen;
- __u32 ltd_index; /* index in lov_obd->tgts */
- unsigned long ltd_active:1,/* is this target up for requests */
- ltd_activate:1,/* should target be activated */
- ltd_reap:1; /* should this target be deleted */
-};
+#define lov_tgt_desc lu_tgt_desc
struct lov_md_tgt_desc {
struct obd_device *lmtd_mdc;
struct kobject *lov_tgts_kobj;
};
-struct lmv_tgt_desc {
- struct obd_uuid ltd_uuid;
- struct obd_device *ltd_obd;
- struct obd_export *ltd_exp;
- __u32 ltd_idx;
- struct mutex ltd_fid_mutex;
- struct obd_statfs ltd_statfs;
- time64_t ltd_statfs_age;
- unsigned long ltd_active:1; /* target up for requests */
-};
+#define lmv_tgt_desc lu_tgt_desc
struct lmv_obd {
struct lu_client_fld lmv_fld;
struct obd_connect_data conn_data;
struct kobject *lmv_tgts_kobj;
void *lmv_cache;
+
+ struct lu_qos lmv_qos;
+ __u32 lmv_qos_rr_index;
};
/* Minimum sector size is 512 */
__u16 rr_padding_4; /* also fix lustre_swab_mdt_rec_reint */
};
+#define LMV_DESC_QOS_MAXAGE_DEFAULT 60 /* Seconds */
+
/* lmv structures */
struct lmv_desc {
__u32 ld_tgt_count; /* how many MDS's */
MODULES := lmv
-lmv-objs := lmv_obd.o lmv_intent.o lmv_fld.o lproc_lmv.o
+lmv-objs := lmv_obd.o lmv_intent.o lmv_fld.o lproc_lmv.o lmv_qos.o
@INCLUDE_RULES@
op_data->op_bias = MDS_CROSS_REF;
CDEBUG(D_INODE, "REMOTE_INTENT with fid="DFID" -> mds #%u\n",
- PFID(&body->mbo_fid1), tgt->ltd_idx);
+ PFID(&body->mbo_fid1), tgt->ltd_index);
/* ask for security context upon intent */
if (it->it_op & (IT_LOOKUP | IT_GETATTR | IT_OPEN) &&
GOTO(cleanup, rc = PTR_ERR(tgt));
CDEBUG(D_INODE, "Revalidate slave "DFID" -> mds #%u\n",
- PFID(&fid), tgt->ltd_idx);
+ PFID(&fid), tgt->ltd_index);
if (req != NULL) {
ptlrpc_req_finished(req);
if (IS_ERR(tgt))
RETURN(PTR_ERR(tgt));
- op_data->op_mds = tgt->ltd_idx;
+ op_data->op_mds = tgt->ltd_index;
} else {
LASSERT(fid_is_sane(&op_data->op_fid1));
LASSERT(fid_is_zero(&op_data->op_fid2));
CDEBUG(D_INODE, "OPEN_INTENT with fid1="DFID", fid2="DFID","
" name='%s' -> mds #%u\n", PFID(&op_data->op_fid1),
- PFID(&op_data->op_fid2), op_data->op_name, tgt->ltd_idx);
+ PFID(&op_data->op_fid2), op_data->op_name, tgt->ltd_index);
rc = md_intent_lock(tgt->ltd_exp, op_data, it, reqp, cb_blocking,
extra_lock_flags);
", name='%s' -> mds #%u\n",
PFID(&op_data->op_fid1), PFID(&op_data->op_fid2),
op_data->op_name ? op_data->op_name : "<NULL>",
- tgt->ltd_idx);
+ tgt->ltd_index);
op_data->op_bias &= ~MDS_CROSS_REF;
int lmv_getattr_name(struct obd_export *exp, struct md_op_data *op_data,
struct ptlrpc_request **preq);
+void lmv_activate_target(struct lmv_obd *lmv, struct lmv_tgt_desc *tgt,
+ int activate);
int lmv_statfs_check_update(struct obd_device *obd, struct lmv_tgt_desc *tgt);
if (lmv->tgts[i] == NULL)
continue;
- if (lmv->tgts[i]->ltd_idx == mdt_idx) {
+ if (lmv->tgts[i]->ltd_index == mdt_idx) {
if (index != NULL)
*index = i;
return lmv->tgts[i];
struct lmv_tgt_desc *lmv_locate_tgt(struct lmv_obd *lmv,
struct md_op_data *op_data);
+/* lmv_qos.c */
+struct lu_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv, __u32 *mdt);
+struct lu_tgt_desc *lmv_locate_tgt_rr(struct lmv_obd *lmv, __u32 *mdt);
+
/* lproc_lmv.c */
int lmv_tunables_init(struct obd_device *obd);
static int lmv_check_connect(struct obd_device *obd);
-static void lmv_activate_target(struct lmv_obd *lmv,
- struct lmv_tgt_desc *tgt,
- int activate)
+void lmv_activate_target(struct lmv_obd *lmv, struct lmv_tgt_desc *tgt,
+ int activate)
{
if (tgt->ltd_active == activate)
return;
int rc;
ENTRY;
- mdc_obd = class_find_client_obd(&tgt->ltd_uuid, LUSTRE_MDC_NAME,
- &obd->obd_uuid);
- if (!mdc_obd) {
- CERROR("target %s not attached\n", tgt->ltd_uuid.uuid);
- RETURN(-EINVAL);
- }
+ mdc_obd = class_find_client_obd(&tgt->ltd_uuid, LUSTRE_MDC_NAME,
+ &obd->obd_uuid);
+ if (!mdc_obd) {
+ CERROR("target %s not attached\n", tgt->ltd_uuid.uuid);
+ RETURN(-EINVAL);
+ }
CDEBUG(D_CONFIG, "connect to %s(%s) - %s, %s\n",
mdc_obd->obd_name, mdc_obd->obd_uuid.uuid,
tgt->ltd_uuid.uuid, obd->obd_uuid.uuid);
- if (!mdc_obd->obd_set_up) {
- CERROR("target %s is not set up\n", tgt->ltd_uuid.uuid);
- RETURN(-EINVAL);
- }
+ if (!mdc_obd->obd_set_up) {
+ CERROR("target %s is not set up\n", tgt->ltd_uuid.uuid);
+ RETURN(-EINVAL);
+ }
rc = obd_connect(NULL, &mdc_exp, mdc_obd, &obd->obd_uuid,
&lmv->conn_data, lmv->lmv_cache);
if (rc)
RETURN(rc);
- target.ft_srv = NULL;
- target.ft_exp = mdc_exp;
- target.ft_idx = tgt->ltd_idx;
+ target.ft_srv = NULL;
+ target.ft_exp = mdc_exp;
+ target.ft_idx = tgt->ltd_index;
- fld_client_add_target(&lmv->lmv_fld, &target);
+ fld_client_add_target(&lmv->lmv_fld, &target);
- rc = obd_register_observer(mdc_obd, obd);
- if (rc) {
- obd_disconnect(mdc_exp);
- CERROR("target %s register_observer error %d\n",
- tgt->ltd_uuid.uuid, rc);
- RETURN(rc);
- }
+ rc = obd_register_observer(mdc_obd, obd);
+ if (rc) {
+ obd_disconnect(mdc_exp);
+ CERROR("target %s register_observer error %d\n",
+ tgt->ltd_uuid.uuid, rc);
+ RETURN(rc);
+ }
if (obd->obd_observer) {
/*
md_init_ea_size(tgt->ltd_exp, lmv->max_easize, lmv->max_def_easize);
+ rc = lqos_add_tgt(&lmv->lmv_qos, tgt);
+ if (rc) {
+ obd_disconnect(mdc_exp);
+ RETURN(rc);
+ }
+
CDEBUG(D_CONFIG, "Connected to %s(%s) successfully (%d)\n",
mdc_obd->obd_name, mdc_obd->obd_uuid.uuid,
atomic_read(&obd->obd_refcount));
if (lmv->tgts[index] == NULL)
return;
+ lqos_del_tgt(&lmv->lmv_qos, lmv->tgts[index]);
+
OBD_FREE_PTR(lmv->tgts[index]);
lmv->tgts[index] = NULL;
return;
__u32 index, int gen)
{
struct obd_device *mdc_obd;
- struct lmv_obd *lmv = &obd->u.lmv;
- struct lmv_tgt_desc *tgt;
- int orig_tgt_count = 0;
- int rc = 0;
- ENTRY;
+ struct lmv_obd *lmv = &obd->u.lmv;
+ struct lmv_tgt_desc *tgt;
+ int orig_tgt_count = 0;
+ int rc = 0;
+
+ ENTRY;
CDEBUG(D_CONFIG, "Target uuid: %s. index %d\n", uuidp->uuid, index);
mdc_obd = class_find_client_obd(uuidp, LUSTRE_MDC_NAME,
}
mutex_init(&tgt->ltd_fid_mutex);
- tgt->ltd_idx = index;
+ tgt->ltd_index = index;
tgt->ltd_uuid = *uuidp;
tgt->ltd_active = 0;
lmv->tgts[index] = tgt;
RETURN(-EINVAL);
/* only files on same MDT can have their layouts swapped */
- if (tgt1->ltd_idx != tgt2->ltd_idx)
+ if (tgt1->ltd_index != tgt2->ltd_index)
RETURN(-EPERM);
rc = obd_iocontrol(cmd, tgt1->ltd_exp, len, karg, uarg);
static int lmv_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
{
- struct lmv_obd *lmv = &obd->u.lmv;
+ struct lmv_obd *lmv = &obd->u.lmv;
struct lmv_desc *desc;
- int rc;
+ struct lnet_process_id lnet_id;
+ int i = 0;
+ int rc;
+
ENTRY;
if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) {
obd_str2uuid(&lmv->desc.ld_uuid, desc->ld_uuid.uuid);
lmv->desc.ld_tgt_count = 0;
lmv->desc.ld_active_tgt_count = 0;
- lmv->desc.ld_qos_maxage = 60;
+ lmv->desc.ld_qos_maxage = LMV_DESC_QOS_MAXAGE_DEFAULT;
lmv->max_def_easize = 0;
lmv->max_easize = 0;
spin_lock_init(&lmv->lmv_lock);
mutex_init(&lmv->lmv_init_mutex);
+ /* Set up allocation policy (QoS and RR) */
+ INIT_LIST_HEAD(&lmv->lmv_qos.lq_svr_list);
+ init_rwsem(&lmv->lmv_qos.lq_rw_sem);
+ lmv->lmv_qos.lq_dirty = 1;
+ lmv->lmv_qos.lq_rr.lqr_dirty = 1;
+ lmv->lmv_qos.lq_reset = 1;
+ /* Default priority is toward free space balance */
+ lmv->lmv_qos.lq_prio_free = 232;
+ /* Default threshold for rr (roughly 17%) */
+ lmv->lmv_qos.lq_threshold_rr = 43;
+
+ /*
+ * initialize rr_index to the lower 32 bits of the client's NID, so
+ * that clients distribute subdirs evenly from the beginning.
+ */
+ while (LNetGetId(i++, &lnet_id) != -ENOENT) {
+ if (LNET_NETTYP(LNET_NIDNET(lnet_id.nid)) != LOLND) {
+ lmv->lmv_qos_rr_index = (u32)lnet_id.nid;
+ break;
+ }
+ }
+
rc = lmv_tunables_init(obd);
if (rc)
CWARN("%s: error adding LMV sysfs/debugfs files: rc = %d\n",
tgt->ltd_statfs = *osfs;
tgt->ltd_statfs_age = ktime_get_seconds();
spin_unlock(&lmv->lmv_lock);
+ lmv->lmv_qos.lq_dirty = 1;
}
return rc;
RETURN(PTR_ERR(tgt));
if (op_data->op_flags & MF_GET_MDT_IDX) {
- op_data->op_mds = tgt->ltd_idx;
+ op_data->op_mds = tgt->ltd_index;
RETURN(0);
}
RETURN(rc);
}
-static struct lmv_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv, __u32 *mdt)
-{
- static unsigned int rr_index;
-
- /* locate MDT round-robin is the first step */
- *mdt = rr_index % lmv->tgts_size;
- rr_index++;
-
- return lmv->tgts[*mdt];
-}
-
static struct lmv_tgt_desc *
lmv_locate_tgt_by_name(struct lmv_obd *lmv, struct lmv_stripe_md *lsm,
const char *name, int namelen, struct lu_fid *fid,
if (IS_ERR(tgt))
return tgt;
- *mds = tgt->ltd_idx;
+ *mds = tgt->ltd_index;
return tgt;
}
lmv_dir_space_hashed(op_data->op_default_mea1) &&
!lmv_dir_striped(lsm)) {
tgt = lmv_locate_tgt_qos(lmv, &op_data->op_mds);
+ if (tgt == ERR_PTR(-EAGAIN))
+ tgt = lmv_locate_tgt_rr(lmv, &op_data->op_mds);
/*
* only update statfs when mkdir under dir with "space" hash,
* this means the cached statfs may be stale, and current mkdir
* may not follow QoS accurately, but it's not serious, and it
* avoids periodic statfs when client doesn't mkdir under
* "space" hashed directories.
+ *
+ * TODO: after the MDT supports QoS object allocation, also update
+ * statfs for 'lfs mkdir -i -1 ...', currently it's done in user
+ * space.
*/
if (!IS_ERR(tgt)) {
struct obd_device *obd;
if (IS_ERR(tgt))
RETURN(PTR_ERR(tgt));
- op_data->op_mds = tgt->ltd_idx;
+ op_data->op_mds = tgt->ltd_index;
}
CDEBUG(D_INODE, "CREATE obj "DFID" -> mds #%x\n",
RETURN(PTR_ERR(tgt));
CDEBUG(D_INODE, "ENQUEUE on "DFID" -> mds #%u\n",
- PFID(&op_data->op_fid1), tgt->ltd_idx);
+ PFID(&op_data->op_fid1), tgt->ltd_index);
rc = md_enqueue(tgt->ltd_exp, einfo, policy, op_data, lockh,
extra_lock_flags);
CDEBUG(D_INODE, "GETATTR_NAME for %*s on "DFID" -> mds #%d\n",
(int)op_data->op_namelen, op_data->op_name,
- PFID(&op_data->op_fid1), tgt->ltd_idx);
+ PFID(&op_data->op_fid1), tgt->ltd_index);
rc = md_getattr_name(tgt->ltd_exp, op_data, preq);
if (rc == -ENOENT && lmv_dir_retry_check_update(op_data)) {
RETURN(PTR_ERR(tgt));
}
- if (tgt->ltd_idx != op_tgt) {
+ if (tgt->ltd_index != op_tgt) {
CDEBUG(D_INODE, "EARLY_CANCEL on "DFID"\n", PFID(fid));
policy.l_inodebits.bits = bits;
rc = md_cancel_unused(tgt->ltd_exp, fid, &policy,
* Cancel UPDATE lock on child (fid1).
*/
op_data->op_flags |= MF_MDC_CANCEL_FID2;
- rc = lmv_early_cancel(exp, NULL, op_data, tgt->ltd_idx, LCK_EX,
+ rc = lmv_early_cancel(exp, NULL, op_data, tgt->ltd_index, LCK_EX,
MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID1);
if (rc != 0)
RETURN(rc);
RETURN(PTR_ERR(child_tgt));
if (!S_ISDIR(op_data->op_mode) && tp_tgt)
- rc = __lmv_fid_alloc(lmv, &target_fid, tp_tgt->ltd_idx);
+ rc = __lmv_fid_alloc(lmv, &target_fid, tp_tgt->ltd_index);
else
rc = lmv_fid_alloc(NULL, exp, &target_fid, op_data);
if (rc)
}
/* cancel UPDATE lock of parent master object */
- rc = lmv_early_cancel(exp, parent_tgt, op_data, tgt->ltd_idx, LCK_EX,
+ rc = lmv_early_cancel(exp, parent_tgt, op_data, tgt->ltd_index, LCK_EX,
MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID1);
if (rc)
RETURN(rc);
op_data->op_fid4 = target_fid;
/* cancel UPDATE locks of target parent */
- rc = lmv_early_cancel(exp, tp_tgt, op_data, tgt->ltd_idx, LCK_EX,
+ rc = lmv_early_cancel(exp, tp_tgt, op_data, tgt->ltd_index, LCK_EX,
MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID2);
if (rc)
RETURN(rc);
/* cancel LOOKUP lock of source if source is remote object */
if (child_tgt != sp_tgt) {
- rc = lmv_early_cancel(exp, sp_tgt, op_data, tgt->ltd_idx,
+ rc = lmv_early_cancel(exp, sp_tgt, op_data, tgt->ltd_index,
LCK_EX, MDS_INODELOCK_LOOKUP,
MF_MDC_CANCEL_FID3);
if (rc)
}
/* cancel ELC locks of source */
- rc = lmv_early_cancel(exp, child_tgt, op_data, tgt->ltd_idx, LCK_EX,
+ rc = lmv_early_cancel(exp, child_tgt, op_data, tgt->ltd_index, LCK_EX,
MDS_INODELOCK_ELC, MF_MDC_CANCEL_FID3);
if (rc)
RETURN(rc);
op_data->op_flags |= MF_MDC_CANCEL_FID4;
/* cancel UPDATE locks of target parent */
- rc = lmv_early_cancel(exp, tp_tgt, op_data, tgt->ltd_idx, LCK_EX,
+ rc = lmv_early_cancel(exp, tp_tgt, op_data, tgt->ltd_index, LCK_EX,
MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID2);
if (rc != 0)
RETURN(rc);
/* cancel LOOKUP lock of target on target parent */
if (tgt != tp_tgt) {
rc = lmv_early_cancel(exp, tp_tgt, op_data,
- tgt->ltd_idx, LCK_EX,
+ tgt->ltd_index, LCK_EX,
MDS_INODELOCK_LOOKUP,
MF_MDC_CANCEL_FID4);
if (rc != 0)
RETURN(PTR_ERR(src_tgt));
/* cancel ELC locks of source */
- rc = lmv_early_cancel(exp, src_tgt, op_data, tgt->ltd_idx,
+ rc = lmv_early_cancel(exp, src_tgt, op_data, tgt->ltd_index,
LCK_EX, MDS_INODELOCK_ELC,
MF_MDC_CANCEL_FID3);
if (rc != 0)
RETURN(PTR_ERR(sp_tgt));
/* cancel UPDATE locks of source parent */
- rc = lmv_early_cancel(exp, sp_tgt, op_data, tgt->ltd_idx, LCK_EX,
+ rc = lmv_early_cancel(exp, sp_tgt, op_data, tgt->ltd_index, LCK_EX,
MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID1);
if (rc != 0)
RETURN(rc);
/* cancel LOOKUP lock of source on source parent */
if (src_tgt != sp_tgt) {
rc = lmv_early_cancel(exp, sp_tgt, op_data,
- tgt->ltd_idx, LCK_EX,
+ tgt->ltd_index, LCK_EX,
MDS_INODELOCK_LOOKUP,
MF_MDC_CANCEL_FID3);
if (rc != 0)
/* cancel LOOKUP lock of target on target parent */
if (tgt != tp_tgt) {
rc = lmv_early_cancel(exp, tp_tgt, op_data,
- tgt->ltd_idx, LCK_EX,
+ tgt->ltd_index, LCK_EX,
MDS_INODELOCK_LOOKUP,
MF_MDC_CANCEL_FID4);
if (rc != 0)
op_data->op_flags |= MF_MDC_CANCEL_FID1 | MF_MDC_CANCEL_FID3;
if (parent_tgt != tgt)
- rc = lmv_early_cancel(exp, parent_tgt, op_data, tgt->ltd_idx,
+ rc = lmv_early_cancel(exp, parent_tgt, op_data, tgt->ltd_index,
LCK_EX, MDS_INODELOCK_LOOKUP,
MF_MDC_CANCEL_FID3);
- rc = lmv_early_cancel(exp, NULL, op_data, tgt->ltd_idx, LCK_EX,
+ rc = lmv_early_cancel(exp, NULL, op_data, tgt->ltd_index, LCK_EX,
MDS_INODELOCK_ELC, MF_MDC_CANCEL_FID3);
if (rc)
RETURN(rc);
CDEBUG(D_INODE, "unlink with fid="DFID"/"DFID" -> mds #%u\n",
- PFID(&op_data->op_fid1), PFID(&op_data->op_fid2), tgt->ltd_idx);
+ PFID(&op_data->op_fid1), PFID(&op_data->op_fid2),
+ tgt->ltd_index);
rc = md_unlink(tgt->ltd_exp, op_data, request);
if (rc == -ENOENT && lmv_dir_retry_check_update(op_data)) {
--- /dev/null
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lustre/lmv/lmv_qos.c
+ *
+ * LMV QoS.
+ * These are the only exported functions; they provide generic
+ * infrastructure for object allocation QoS.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_LMV
+
+#include <asm/div64.h>
+#include <libcfs/libcfs.h>
+#include <uapi/linux/lustre/lustre_idl.h>
+#include <lustre_swab.h>
+#include <obd_class.h>
+
+#include "lmv_internal.h"
+
+static inline __u64 tgt_statfs_bavail(struct lu_tgt_desc *tgt)
+{
+ struct obd_statfs *statfs = &tgt->ltd_statfs;
+
+ return statfs->os_bavail * statfs->os_bsize;
+}
+
+static inline __u64 tgt_statfs_iavail(struct lu_tgt_desc *tgt)
+{
+ return tgt->ltd_statfs.os_ffree;
+}
+
+/**
+ * Calculate penalties per-tgt and per-server
+ *
+ * Re-calculate penalties when the configuration changes, when active targets
+ * change, and after a statfs refresh (all of these set the lq_dirty flag).
+ * On every MDT and MDS: decay the penalty by half for every 8x the update
+ * interval that the device has been idle. That gives lots of time for the
+ * statfs information to be updated (which the penalty is only a proxy for),
+ * and avoids penalizing MDS/MDTs under light load.
+ * See lmv_qos_calc_weight() for how penalties are factored into the weight.
+ *
+ * \param[in] lmv LMV device
+ *
+ * \retval 0 on success
+ * \retval -EAGAIN fewer than two MDTs are active, or all MDTs have
+ * almost the same space available
+ */
+static int lmv_qos_calc_ppts(struct lmv_obd *lmv)
+{
+ struct lu_qos *qos = &lmv->lmv_qos;
+ struct lu_tgt_desc *tgt;
+ struct lu_svr_qos *svr;
+ __u64 ba_max, ba_min, ba;
+ __u64 ia_max, ia_min, ia;
+ __u32 num_active;
+ unsigned int i;
+ int prio_wide;
+ time64_t now, age;
+ __u32 maxage = lmv->desc.ld_qos_maxage;
+ int rc;
+
+ ENTRY;
+
+ if (!qos->lq_dirty)
+ GOTO(out, rc = 0);
+
+ num_active = lmv->desc.ld_active_tgt_count;
+ if (num_active < 2)
+ GOTO(out, rc = -EAGAIN);
+
+ /* find bavail on each server */
+ list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) {
+ svr->lsq_bavail = 0;
+ svr->lsq_iavail = 0;
+ }
+ qos->lq_active_svr_count = 0;
+
+ /*
+ * How badly the user wants to select targets "widely" (not recently
+ * chosen and not on recently used MDSs), as opposed to "freely"
+ * (by available free space). Range: 0-256.
+ */
+ prio_wide = 256 - qos->lq_prio_free;
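+ /*
+ * e.g. with the default lq_prio_free of 232 (~91%), prio_wide is 24,
+ * so penalties stay small and free space dominates the target weights
+ */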
+
+ ba_min = (__u64)(-1);
+ ba_max = 0;
+ ia_min = (__u64)(-1);
+ ia_max = 0;
+ now = ktime_get_real_seconds();
+
+ /* Calculate server penalty per object */
+ for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+ tgt = lmv->tgts[i];
+ if (!tgt || !tgt->ltd_exp || !tgt->ltd_active)
+ continue;
+
+ /* bavail >> 16 to avoid overflow */
+ ba = tgt_statfs_bavail(tgt) >> 16;
+ if (!ba)
+ continue;
+
+ ba_min = min(ba, ba_min);
+ ba_max = max(ba, ba_max);
+
+ /* iavail >> 8 to avoid overflow */
+ ia = tgt_statfs_iavail(tgt) >> 8;
+ if (!ia)
+ continue;
+
+ ia_min = min(ia, ia_min);
+ ia_max = max(ia, ia_max);
+
+ /* Count the number of usable MDS's */
+ if (tgt->ltd_qos.ltq_svr->lsq_bavail == 0)
+ qos->lq_active_svr_count++;
+ tgt->ltd_qos.ltq_svr->lsq_bavail += ba;
+ tgt->ltd_qos.ltq_svr->lsq_iavail += ia;
+
+ /*
+ * per-MDT penalty is
+ * prio * bavail * iavail / (num_tgt - 1) / 2
+ */
+ tgt->ltd_qos.ltq_penalty_per_obj = prio_wide * ba * ia;
+ do_div(tgt->ltd_qos.ltq_penalty_per_obj, num_active - 1);
+ tgt->ltd_qos.ltq_penalty_per_obj >>= 1;
+
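+ /* age is 1/8th of the idle time; one right shift per maxage of age
+ * halves the penalty every 8 * maxage seconds of idle time */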
+ age = (now - tgt->ltd_qos.ltq_used) >> 3;
+ if (qos->lq_reset || age > 32 * maxage)
+ tgt->ltd_qos.ltq_penalty = 0;
+ else if (age > maxage)
+ /* Decay tgt penalty. */
+ tgt->ltd_qos.ltq_penalty >>= (age / maxage);
+ }
+
+ num_active = qos->lq_active_svr_count;
+ if (num_active < 2) {
+ /*
+ * If there's only 1 MDS, we can't penalize it, so instead
+ * we have to double the MDT penalty
+ */
+ num_active = 2;
+ for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+ tgt = lmv->tgts[i];
+ if (!tgt || !tgt->ltd_exp || !tgt->ltd_active)
+ continue;
+
+ tgt->ltd_qos.ltq_penalty_per_obj <<= 1;
+ }
+ }
+
+ /*
+ * Per-MDS penalty is
+ * prio * bavail * iavail / server_tgts / (num_svr - 1) / 2
+ */
+ list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) {
+ ba = svr->lsq_bavail;
+ ia = svr->lsq_iavail;
+ svr->lsq_penalty_per_obj = prio_wide * ba * ia;
+ do_div(svr->lsq_penalty_per_obj,
+ svr->lsq_tgt_count * (num_active - 1));
+ svr->lsq_penalty_per_obj >>= 1;
+
+ age = (now - svr->lsq_used) >> 3;
+ if (qos->lq_reset || age > 32 * maxage)
+ svr->lsq_penalty = 0;
+ else if (age > maxage)
+ /* Decay server penalty. */
+ svr->lsq_penalty >>= age / maxage;
+ }
+
+ qos->lq_dirty = 0;
+ qos->lq_reset = 0;
+
+ /*
+ * If each MDT has almost the same free space, do RR allocation for
+ * better creation performance
+ */
+ qos->lq_same_space = 0;
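+ /*
+ * e.g. with the default lq_threshold_rr of 43 (~17%), the emptiest
+ * MDT must hold at least ~83% of the space and inodes of the fullest
+ * one before allocation switches to round-robin
+ */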
+ if ((ba_max * (256 - qos->lq_threshold_rr)) >> 8 < ba_min &&
+ (ia_max * (256 - qos->lq_threshold_rr)) >> 8 < ia_min) {
+ qos->lq_same_space = 1;
+ /* Reset weights for the next time we enter qos mode */
+ qos->lq_reset = 1;
+ }
+ rc = 0;
+
+out:
+ if (!rc && qos->lq_same_space)
+ RETURN(-EAGAIN);
+
+ RETURN(rc);
+}
+
+static inline bool lmv_qos_is_usable(struct lmv_obd *lmv)
+{
+ if (!lmv->lmv_qos.lq_dirty && lmv->lmv_qos.lq_same_space)
+ return false;
+
+ if (lmv->desc.ld_active_tgt_count < 2)
+ return false;
+
+ return true;
+}
+
+/**
+ * Calculate weight for a given MDT.
+ *
+ * The final MDT weight is (bavail >> 16) * (iavail >> 8) minus the MDT
+ * and MDS penalties. See lmv_qos_calc_ppts() for how penalties are
+ * calculated.
+ *
+ * \param[in] tgt MDT target descriptor
+ */
+static void lmv_qos_calc_weight(struct lu_tgt_desc *tgt)
+{
+ struct lu_tgt_qos *ltq = &tgt->ltd_qos;
+ __u64 temp, temp2;
+
+ temp = (tgt_statfs_bavail(tgt) >> 16) * (tgt_statfs_iavail(tgt) >> 8);
+ temp2 = ltq->ltq_penalty + ltq->ltq_svr->lsq_penalty;
+ if (temp < temp2)
+ ltq->ltq_weight = 0;
+ else
+ ltq->ltq_weight = temp - temp2;
+}
+
+/**
+ * Re-calculate weights.
+ *
+ * The function is called when some target was used for a new object. In
+ * this case we should re-calculate all the weights to keep new allocations
+ * well balanced.
+ *
+ * \param[in] lmv LMV device
+ * \param[in] tgt target where a new object was placed
+ * \param[out] total_wt new total weight for the pool
+ *
+ * \retval 0
+ */
+static int lmv_qos_used(struct lmv_obd *lmv, struct lu_tgt_desc *tgt,
+ __u64 *total_wt)
+{
+ struct lu_tgt_qos *ltq;
+ struct lu_svr_qos *svr;
+ unsigned int i;
+
+ ENTRY;
+
+ ltq = &tgt->ltd_qos;
+ LASSERT(ltq);
+
+ /* Don't allocate on this device anymore, until the next alloc_qos */
+ ltq->ltq_usable = 0;
+
+ svr = ltq->ltq_svr;
+
+ /*
+ * Decay old penalty by half (we're adding max penalty, and don't
+ * want it to run away.)
+ */
+ ltq->ltq_penalty >>= 1;
+ svr->lsq_penalty >>= 1;
+
+ /* mark the MDS and MDT as recently used */
+ ltq->ltq_used = svr->lsq_used = ktime_get_real_seconds();
+
+ /* Set max penalties for this MDT and MDS */
+ ltq->ltq_penalty += ltq->ltq_penalty_per_obj *
+ lmv->desc.ld_active_tgt_count;
+ svr->lsq_penalty += svr->lsq_penalty_per_obj *
+ lmv->lmv_qos.lq_active_svr_count;
+
+ /* Decrease all MDS penalties */
+ list_for_each_entry(svr, &lmv->lmv_qos.lq_svr_list, lsq_svr_list) {
+ if (svr->lsq_penalty < svr->lsq_penalty_per_obj)
+ svr->lsq_penalty = 0;
+ else
+ svr->lsq_penalty -= svr->lsq_penalty_per_obj;
+ }
+
+ *total_wt = 0;
+ /* Decrease all MDT penalties */
+ for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+ tgt = lmv->tgts[i];
+ if (!tgt || !tgt->ltd_exp || !tgt->ltd_active)
+ continue;
+
+ ltq = &tgt->ltd_qos;
+
+ if (ltq->ltq_penalty < ltq->ltq_penalty_per_obj)
+ ltq->ltq_penalty = 0;
+ else
+ ltq->ltq_penalty -= ltq->ltq_penalty_per_obj;
+
+ lmv_qos_calc_weight(tgt);
+
+ /* Recalc the total weight of usable MDTs */
+ if (ltq->ltq_usable)
+ *total_wt += ltq->ltq_weight;
+
+ CDEBUG(D_OTHER, "recalc tgt %d usable=%d avail=%llu"
+ " tgtppo=%llu tgtp=%llu svrppo=%llu"
+ " svrp=%llu wt=%llu\n",
+ i, ltq->ltq_usable,
+ tgt_statfs_bavail(tgt) >> 10,
+ ltq->ltq_penalty_per_obj >> 10,
+ ltq->ltq_penalty >> 10,
+ ltq->ltq_svr->lsq_penalty_per_obj >> 10,
+ ltq->ltq_svr->lsq_penalty >> 10,
+ ltq->ltq_weight >> 10);
+ }
+
+ RETURN(0);
+}
+
+struct lu_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv, __u32 *mdt)
+{
+ struct lu_tgt_desc *tgt;
+ __u64 total_weight = 0;
+ __u64 cur_weight = 0;
+ __u64 rand;
+ int i;
+ int rc;
+
+ ENTRY;
+
+ if (!lmv_qos_is_usable(lmv))
+ RETURN(ERR_PTR(-EAGAIN));
+
+ down_write(&lmv->lmv_qos.lq_rw_sem);
+
+ if (!lmv_qos_is_usable(lmv))
+ GOTO(unlock, tgt = ERR_PTR(-EAGAIN));
+
+ rc = lmv_qos_calc_ppts(lmv);
+ if (rc)
+ GOTO(unlock, tgt = ERR_PTR(rc));
+
+ for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+ tgt = lmv->tgts[i];
+ if (!tgt)
+ continue;
+
+ tgt->ltd_qos.ltq_usable = 0;
+ if (!tgt->ltd_exp || !tgt->ltd_active)
+ continue;
+
+ tgt->ltd_qos.ltq_usable = 1;
+ lmv_qos_calc_weight(tgt);
+ total_weight += tgt->ltd_qos.ltq_weight;
+ }
+
+ if (total_weight) {
+#if BITS_PER_LONG == 32
+ rand = cfs_rand() % (unsigned int)total_weight;
+ /*
+ * If total_weight > 32-bit, first generate the high
+ * 32 bits of the random number, then add in the low
+ * 32 bits (truncated to the upper limit, if needed)
+ */
+ if (total_weight > 0xffffffffULL)
+ rand = (__u64)(cfs_rand() %
+ (unsigned int)(total_weight >> 32)) << 32;
+ else
+ rand = 0;
+
+ if (rand == (total_weight & 0xffffffff00000000ULL))
+ rand |= cfs_rand() % (unsigned int)total_weight;
+ else
+ rand |= cfs_rand();
+
+#else
+ rand = ((__u64)cfs_rand() << 32 | cfs_rand()) % total_weight;
+#endif
+ } else {
+ rand = 0;
+ }
+
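+ /*
+ * Walk the usable targets accumulating their weights; the first target
+ * whose cumulative weight reaches the random value is picked, so each
+ * target is chosen with probability proportional to its weight.
+ */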
+ for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+ tgt = lmv->tgts[i];
+
+ if (!tgt || !tgt->ltd_qos.ltq_usable)
+ continue;
+
+ cur_weight += tgt->ltd_qos.ltq_weight;
+ if (cur_weight < rand)
+ continue;
+
+ *mdt = tgt->ltd_index;
+ lmv_qos_used(lmv, tgt, &total_weight);
+ GOTO(unlock, rc = 0);
+ }
+
+ /* no proper target found */
+ GOTO(unlock, tgt = ERR_PTR(-EAGAIN));
+unlock:
+ up_write(&lmv->lmv_qos.lq_rw_sem);
+
+ return tgt;
+}
+
+struct lu_tgt_desc *lmv_locate_tgt_rr(struct lmv_obd *lmv, __u32 *mdt)
+{
+ struct lu_tgt_desc *tgt;
+ int i;
+
+ ENTRY;
+
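+ /*
+ * Scan from the saved lmv_qos_rr_index and advance it past the chosen
+ * MDT, so that successive calls rotate over all active MDTs.
+ */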
+ spin_lock(&lmv->lmv_qos.lq_rr.lqr_alloc);
+ for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+ tgt = lmv->tgts[(i + lmv->lmv_qos_rr_index) %
+ lmv->desc.ld_tgt_count];
+ if (tgt && tgt->ltd_exp && tgt->ltd_active) {
+ *mdt = tgt->ltd_index;
+ lmv->lmv_qos_rr_index =
+ (i + lmv->lmv_qos_rr_index + 1) %
+ lmv->desc.ld_tgt_count;
+ spin_unlock(&lmv->lmv_qos.lq_rr.lqr_alloc);
+
+ RETURN(tgt);
+ }
+ }
+ spin_unlock(&lmv->lmv_qos.lq_rr.lqr_alloc);
+
+ RETURN(ERR_PTR(-ENODEV));
+}
}
LUSTRE_RO_ATTR(desc_uuid);
+static ssize_t qos_maxage_show(struct kobject *kobj,
+ struct attribute *attr,
+ char *buf)
+{
+ struct obd_device *dev = container_of(kobj, struct obd_device,
+ obd_kset.kobj);
+
+ return sprintf(buf, "%u\n", dev->u.lmv.desc.ld_qos_maxage);
+}
+
+static ssize_t qos_maxage_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buffer,
+ size_t count)
+{
+ struct obd_device *dev = container_of(kobj, struct obd_device,
+ obd_kset.kobj);
+ unsigned int val;
+ int rc;
+
+ rc = kstrtouint(buffer, 0, &val);
+ if (rc)
+ return rc;
+
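+ /* also the age unit for penalty decay in lmv_qos_calc_ppts() */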
+ dev->u.lmv.desc.ld_qos_maxage = val;
+
+ return count;
+}
+LUSTRE_RW_ATTR(qos_maxage);
+
+static ssize_t qos_prio_free_show(struct kobject *kobj,
+ struct attribute *attr,
+ char *buf)
+{
+ struct obd_device *dev = container_of(kobj, struct obd_device,
+ obd_kset.kobj);
+
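+ /* convert the fixed-point fraction of 256 to a percentage, rounding up */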
+ return sprintf(buf, "%u%%\n",
+ (dev->u.lmv.lmv_qos.lq_prio_free * 100 + 255) >> 8);
+}
+
+static ssize_t qos_prio_free_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buffer,
+ size_t count)
+{
+ struct obd_device *dev = container_of(kobj, struct obd_device,
+ obd_kset.kobj);
+ struct lmv_obd *lmv = &dev->u.lmv;
+ unsigned int val;
+ int rc;
+
+ rc = kstrtouint(buffer, 0, &val);
+ if (rc)
+ return rc;
+
+ if (val > 100)
+ return -EINVAL;
+
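+ /* store the percentage as a fixed-point fraction of 256 */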
+ lmv->lmv_qos.lq_prio_free = (val << 8) / 100;
+ lmv->lmv_qos.lq_dirty = 1;
+ lmv->lmv_qos.lq_reset = 1;
+
+ return count;
+}
+LUSTRE_RW_ATTR(qos_prio_free);
+
+static ssize_t qos_threshold_rr_show(struct kobject *kobj,
+ struct attribute *attr,
+ char *buf)
+{
+ struct obd_device *dev = container_of(kobj, struct obd_device,
+ obd_kset.kobj);
+
+ return sprintf(buf, "%u%%\n",
+ (dev->u.lmv.lmv_qos.lq_threshold_rr * 100 + 255) >> 8);
+}
+
+static ssize_t qos_threshold_rr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buffer,
+ size_t count)
+{
+ struct obd_device *dev = container_of(kobj, struct obd_device,
+ obd_kset.kobj);
+ struct lmv_obd *lmv = &dev->u.lmv;
+ unsigned int val;
+ int rc;
+
+ rc = kstrtouint(buffer, 0, &val);
+ if (rc)
+ return rc;
+
+ if (val > 100)
+ return -EINVAL;
+
+ lmv->lmv_qos.lq_threshold_rr = (val << 8) / 100;
+ lmv->lmv_qos.lq_dirty = 1;
+
+ return count;
+}
+LUSTRE_RW_ATTR(qos_threshold_rr);
+
#ifdef CONFIG_PROC_FS
static void *lmv_tgt_seq_start(struct seq_file *p, loff_t *pos)
{
return 0;
seq_printf(p, "%u: %s %sACTIVE\n",
- tgt->ltd_idx, tgt->ltd_uuid.uuid,
+ tgt->ltd_index, tgt->ltd_uuid.uuid,
tgt->ltd_active ? "" : "IN");
return 0;
}
&lustre_attr_activeobd.attr,
&lustre_attr_desc_uuid.attr,
&lustre_attr_numobd.attr,
+ &lustre_attr_qos_maxage.attr,
+ &lustre_attr_qos_prio_free.attr,
+ &lustre_attr_qos_threshold_rr.attr,
NULL,
};
#define LMVEA_DELETE_VALUES(count, offset) \
((count) == 0 && (offset) == (typeof(offset))(-1))
-struct lod_qos_rr {
- spinlock_t lqr_alloc; /* protect allocation index */
- __u32 lqr_start_idx; /* start index of new inode */
- __u32 lqr_offset_idx;/* aliasing for start_idx */
- int lqr_start_count;/* reseed counter */
- struct ost_pool lqr_pool; /* round-robin optimized list */
- unsigned long lqr_dirty:1; /* recalc round-robin list */
-};
-
struct pool_desc {
char pool_name[LOV_MAXPOOLNAME + 1];
struct ost_pool pool_obds; /* pool members */
atomic_t pool_refcount;
- struct lod_qos_rr pool_rr;
+ struct lu_qos_rr pool_rr;
struct hlist_node pool_hash; /* access by poolname */
struct list_head pool_list;
struct proc_dir_entry *pool_proc_entry;
#define pool_tgt_array(p) ((p)->pool_obds.op_array)
#define pool_tgt_rw_sem(p) ((p)->pool_obds.op_rw_sem)
-struct lod_qos {
- struct list_head lq_oss_list;
- struct rw_semaphore lq_rw_sem;
- __u32 lq_active_oss_count;
- unsigned int lq_prio_free; /* priority for free space */
- unsigned int lq_threshold_rr;/* priority for rr */
- struct lod_qos_rr lq_rr; /* round robin qos data */
- bool lq_dirty:1, /* recalc qos data */
- lq_same_space:1,/* the ost's all have approx.
- the same space avail */
- lq_reset:1; /* zero current penalties */
-};
-
-struct lod_qos_oss {
- struct obd_uuid lqo_uuid; /* ptlrpc's c_remote_uuid */
- struct list_head lqo_oss_list; /* link to lov_qos */
- __u64 lqo_bavail; /* total bytes avail on OSS */
- __u64 lqo_penalty; /* current penalty */
- __u64 lqo_penalty_per_obj; /* penalty decrease
- every obj*/
- time64_t lqo_used; /* last used time, seconds */
- __u32 lqo_ost_count; /* number of osts on this oss */
- __u32 lqo_id; /* unique oss id */
-};
-
-struct ltd_qos {
- struct lod_qos_oss *ltq_oss; /* oss info */
- __u64 ltq_penalty; /* current penalty */
- __u64 ltq_penalty_per_obj; /* penalty decrease
- every obj*/
- __u64 ltq_weight; /* net weighting */
- time64_t ltq_used; /* last used time, seconds */
- bool ltq_usable:1; /* usable for striping */
-};
-
-struct lod_tgt_desc {
- struct dt_device *ltd_tgt;
- struct list_head ltd_kill;
- struct obd_export *ltd_exp;
- struct obd_uuid ltd_uuid;
- __u32 ltd_gen;
- __u32 ltd_index;
- struct ltd_qos ltd_qos; /* qos info per target */
- struct obd_statfs ltd_statfs;
- struct ptlrpc_thread *ltd_recovery_thread;
- unsigned long ltd_active:1,/* is this target up for requests */
- ltd_activate:1,/* should target be activated */
- ltd_reap:1, /* should this target be deleted */
- ltd_got_update_log:1, /* Already got update log */
- ltd_connecting:1; /* target is connecting */
-};
+#define lod_tgt_desc lu_tgt_desc
#define TGT_PTRS 256 /* number of pointers at 1st level */
#define TGT_PTRS_PER_BLOCK 256 /* number of pointers at 2nd level */
* structure should be moved to lod_tgt_descs as well.
*/
/* QoS info per LOD */
- struct lod_qos lod_qos; /* qos info per lod */
+ struct lu_qos lod_qos; /* qos info per lod */
/* OST pool data */
struct ost_pool lod_pool_info; /* all OSTs in a packed array */
#define lod_ostnr lod_ost_descs.ltd_tgtnr
#define lod_osts_size lod_ost_descs.ltd_tgts_size
#define ltd_ost ltd_tgt
-#define lod_ost_desc lod_tgt_desc
+#define lod_ost_desc lu_tgt_desc
#define lod_mdts lod_mdt_descs.ltd_tgts
#define lod_mdt_bitmap lod_mdt_descs.ltd_tgt_bitmap
#define lod_remote_mdt_count lod_mdt_descs.ltd_tgtnr
#define lod_mdts_size lod_mdt_descs.ltd_tgts_size
#define ltd_mdt ltd_tgt
-#define lod_mdt_desc lod_tgt_desc
+#define lod_mdt_desc lu_tgt_desc
struct lod_layout_component {
struct lu_extent llc_extent;
int lod_prepare_create(const struct lu_env *env, struct lod_object *lo,
struct lu_attr *attr, const struct lu_buf *buf,
struct thandle *th);
-int qos_add_tgt(struct lod_device*, struct lod_tgt_desc *);
-int qos_del_tgt(struct lod_device *, struct lod_tgt_desc *);
-void lod_qos_rr_init(struct lod_qos_rr *lqr);
+void lod_qos_rr_init(struct lu_qos_rr *lqr);
int lod_use_defined_striping(const struct lu_env *, struct lod_object *,
const struct lu_buf *);
int lod_qos_parse_config(const struct lu_env *env, struct lod_object *lo,
list_del(&tgt_desc->ltd_kill);
if (ltd == &lod->lod_ost_descs) {
/* remove from QoS structures */
- rc = qos_del_tgt(lod, tgt_desc);
+ rc = lqos_del_tgt(&lod->lod_qos, tgt_desc);
if (rc)
CERROR("%s: qos_del_tgt(%s) failed:"
"rc = %d\n",
GOTO(out_mutex, rc);
}
- rc = qos_add_tgt(lod, tgt_desc);
+ rc = lqos_add_tgt(&lod->lod_qos, tgt_desc);
if (rc) {
CERROR("%s: qos_add_tgt failed with %d\n",
obd->obd_name, rc);
lod->lod_sp_me = LUSTRE_SP_CLI;
/* Set up allocation policy (QoS and RR) */
- INIT_LIST_HEAD(&lod->lod_qos.lq_oss_list);
+ INIT_LIST_HEAD(&lod->lod_qos.lq_svr_list);
init_rwsem(&lod->lod_qos.lq_rw_sem);
lod->lod_qos.lq_dirty = 1;
lod->lod_qos.lq_rr.lqr_dirty = 1;
OST_TGT(lod,i)->ltd_statfs.os_bsize)
/**
- * Add a new target to Quality of Service (QoS) target table.
- *
- * Add a new OST target to the structure representing an OSS. Resort the list
- * of known OSSs by the number of OSTs attached to each OSS. The OSS list is
- * protected internally and no external locking is required.
- *
- * \param[in] lod LOD device
- * \param[in] ost_desc OST description
- *
- * \retval 0 on success
- * \retval -ENOMEM on error
- */
-int qos_add_tgt(struct lod_device *lod, struct lod_tgt_desc *ost_desc)
-{
- struct lod_qos_oss *oss = NULL, *temposs;
- struct obd_export *exp = ost_desc->ltd_exp;
- int rc = 0, found = 0;
- struct list_head *list;
- __u32 id = 0;
- ENTRY;
-
- down_write(&lod->lod_qos.lq_rw_sem);
- /*
- * a bit hacky approach to learn NID of corresponding connection
- * but there is no official API to access information like this
- * with OSD API.
- */
- list_for_each_entry(oss, &lod->lod_qos.lq_oss_list, lqo_oss_list) {
- if (obd_uuid_equals(&oss->lqo_uuid,
- &exp->exp_connection->c_remote_uuid)) {
- found++;
- break;
- }
- if (oss->lqo_id > id)
- id = oss->lqo_id;
- }
-
- if (!found) {
- OBD_ALLOC_PTR(oss);
- if (!oss)
- GOTO(out, rc = -ENOMEM);
- memcpy(&oss->lqo_uuid, &exp->exp_connection->c_remote_uuid,
- sizeof(oss->lqo_uuid));
- ++id;
- oss->lqo_id = id;
- } else {
- /* Assume we have to move this one */
- list_del(&oss->lqo_oss_list);
- }
-
- oss->lqo_ost_count++;
- ost_desc->ltd_qos.ltq_oss = oss;
-
- CDEBUG(D_QOS, "add tgt %s to OSS %s (%d OSTs)\n",
- obd_uuid2str(&ost_desc->ltd_uuid), obd_uuid2str(&oss->lqo_uuid),
- oss->lqo_ost_count);
-
- /* Add sorted by # of OSTs. Find the first entry that we're
- bigger than... */
- list = &lod->lod_qos.lq_oss_list;
- list_for_each_entry(temposs, list, lqo_oss_list) {
- if (oss->lqo_ost_count > temposs->lqo_ost_count)
- break;
- }
- /* ...and add before it. If we're the first or smallest, temposs
- points to the list head, and we add to the end. */
- list_add_tail(&oss->lqo_oss_list, &temposs->lqo_oss_list);
-
- lod->lod_qos.lq_dirty = 1;
- lod->lod_qos.lq_rr.lqr_dirty = 1;
-
-out:
- up_write(&lod->lod_qos.lq_rw_sem);
- RETURN(rc);
-}
-
-/**
- * Remove OST target from QoS table.
- *
- * Removes given OST target from QoS table and releases related OSS structure
- * if no OSTs remain on the OSS.
- *
- * \param[in] lod LOD device
- * \param[in] ost_desc OST description
- *
- * \retval 0 on success
- * \retval -ENOENT if no OSS was found
- */
-int qos_del_tgt(struct lod_device *lod, struct lod_tgt_desc *ost_desc)
-{
- struct lod_qos_oss *oss;
- int rc = 0;
- ENTRY;
-
- down_write(&lod->lod_qos.lq_rw_sem);
- oss = ost_desc->ltd_qos.ltq_oss;
- if (!oss)
- GOTO(out, rc = -ENOENT);
-
- oss->lqo_ost_count--;
- if (oss->lqo_ost_count == 0) {
- CDEBUG(D_QOS, "removing OSS %s\n",
- obd_uuid2str(&oss->lqo_uuid));
- list_del(&oss->lqo_oss_list);
- ost_desc->ltd_qos.ltq_oss = NULL;
- OBD_FREE_PTR(oss);
- }
-
- lod->lod_qos.lq_dirty = 1;
- lod->lod_qos.lq_rr.lqr_dirty = 1;
-out:
- up_write(&lod->lod_qos.lq_rw_sem);
- RETURN(rc);
-}
-
-/**
* Check whether the target is available for new OST objects.
*
* Request statfs data from the given target and verify it's active and not
*/
static int lod_qos_calc_ppo(struct lod_device *lod)
{
- struct lod_qos_oss *oss;
- __u64 ba_max, ba_min, temp;
- __u32 num_active;
- unsigned int i;
- int rc, prio_wide;
- time64_t now, age;
+ struct lu_svr_qos *oss;
+ __u64 ba_max, ba_min, temp;
+ __u32 num_active;
+ unsigned int i;
+ int rc, prio_wide;
+ time64_t now, age;
+
ENTRY;
if (!lod->lod_qos.lq_dirty)
GOTO(out, rc = -EAGAIN);
/* find bavail on each OSS */
- list_for_each_entry(oss, &lod->lod_qos.lq_oss_list, lqo_oss_list)
- oss->lqo_bavail = 0;
- lod->lod_qos.lq_active_oss_count = 0;
+ list_for_each_entry(oss, &lod->lod_qos.lq_svr_list, lsq_svr_list)
+ oss->lsq_bavail = 0;
+ lod->lod_qos.lq_active_svr_count = 0;
/*
* How badly user wants to select OSTs "widely" (not recently chosen
ba_max = max(temp, ba_max);
/* Count the number of usable OSS's */
- if (OST_TGT(lod,i)->ltd_qos.ltq_oss->lqo_bavail == 0)
- lod->lod_qos.lq_active_oss_count++;
- OST_TGT(lod,i)->ltd_qos.ltq_oss->lqo_bavail += temp;
+ if (OST_TGT(lod, i)->ltd_qos.ltq_svr->lsq_bavail == 0)
+ lod->lod_qos.lq_active_svr_count++;
+ OST_TGT(lod, i)->ltd_qos.ltq_svr->lsq_bavail += temp;
/* per-OST penalty is prio * TGT_bavail / (num_ost - 1) / 2 */
temp >>= 1;
(age / lod->lod_desc.ld_qos_maxage);
}
- num_active = lod->lod_qos.lq_active_oss_count - 1;
+ num_active = lod->lod_qos.lq_active_svr_count - 1;
if (num_active < 1) {
/* If there's only 1 OSS, we can't penalize it, so instead
we have to double the OST penalty */
}
/* Per-OSS penalty is prio * oss_avail / oss_osts / (num_oss - 1) / 2 */
- list_for_each_entry(oss, &lod->lod_qos.lq_oss_list, lqo_oss_list) {
- temp = oss->lqo_bavail >> 1;
- do_div(temp, oss->lqo_ost_count * num_active);
- oss->lqo_penalty_per_obj = (temp * prio_wide) >> 8;
+ list_for_each_entry(oss, &lod->lod_qos.lq_svr_list, lsq_svr_list) {
+ temp = oss->lsq_bavail >> 1;
+ do_div(temp, oss->lsq_tgt_count * num_active);
+ oss->lsq_penalty_per_obj = (temp * prio_wide) >> 8;
- age = (now - oss->lqo_used) >> 3;
+ age = (now - oss->lsq_used) >> 3;
if (lod->lod_qos.lq_reset ||
age > 32 * lod->lod_desc.ld_qos_maxage)
- oss->lqo_penalty = 0;
+ oss->lsq_penalty = 0;
else if (age > lod->lod_desc.ld_qos_maxage)
/* Decay OSS penalty. */
- oss->lqo_penalty >>= age / lod->lod_desc.ld_qos_maxage;
+ oss->lsq_penalty >>= age / lod->lod_desc.ld_qos_maxage;
}
lod->lod_qos.lq_dirty = 0;
__u64 temp, temp2;
temp = TGT_BAVAIL(i);
- temp2 = OST_TGT(lod,i)->ltd_qos.ltq_penalty +
- OST_TGT(lod,i)->ltd_qos.ltq_oss->lqo_penalty;
+ temp2 = OST_TGT(lod, i)->ltd_qos.ltq_penalty +
+ OST_TGT(lod, i)->ltd_qos.ltq_svr->lsq_penalty;
if (temp < temp2)
- OST_TGT(lod,i)->ltd_qos.ltq_weight = 0;
+ OST_TGT(lod, i)->ltd_qos.ltq_weight = 0;
else
- OST_TGT(lod,i)->ltd_qos.ltq_weight = temp - temp2;
+ OST_TGT(lod, i)->ltd_qos.ltq_weight = temp - temp2;
return 0;
}
__u32 index, __u64 *total_wt)
{
struct lod_tgt_desc *ost;
- struct lod_qos_oss *oss;
+ struct lu_svr_qos *oss;
unsigned int j;
ENTRY;
/* Don't allocate on this device anymore, until the next alloc_qos */
ost->ltd_qos.ltq_usable = 0;
- oss = ost->ltd_qos.ltq_oss;
+ oss = ost->ltd_qos.ltq_svr;
/* Decay old penalty by half (we're adding max penalty, and don't
want it to run away.) */
ost->ltd_qos.ltq_penalty >>= 1;
- oss->lqo_penalty >>= 1;
+ oss->lsq_penalty >>= 1;
/* mark the OSS and OST as recently used */
- ost->ltd_qos.ltq_used = oss->lqo_used = ktime_get_real_seconds();
+ ost->ltd_qos.ltq_used = oss->lsq_used = ktime_get_real_seconds();
/* Set max penalties for this OST and OSS */
ost->ltd_qos.ltq_penalty +=
ost->ltd_qos.ltq_penalty_per_obj * lod->lod_ostnr;
- oss->lqo_penalty += oss->lqo_penalty_per_obj *
- lod->lod_qos.lq_active_oss_count;
+ oss->lsq_penalty += oss->lsq_penalty_per_obj *
+ lod->lod_qos.lq_active_svr_count;
/* Decrease all OSS penalties */
- list_for_each_entry(oss, &lod->lod_qos.lq_oss_list, lqo_oss_list) {
- if (oss->lqo_penalty < oss->lqo_penalty_per_obj)
- oss->lqo_penalty = 0;
+ list_for_each_entry(oss, &lod->lod_qos.lq_svr_list, lsq_svr_list) {
+ if (oss->lsq_penalty < oss->lsq_penalty_per_obj)
+ oss->lsq_penalty = 0;
else
- oss->lqo_penalty -= oss->lqo_penalty_per_obj;
+ oss->lsq_penalty -= oss->lsq_penalty_per_obj;
}
*total_wt = 0;
i, ost->ltd_qos.ltq_usable, TGT_BAVAIL(i) >> 10,
ost->ltd_qos.ltq_penalty_per_obj >> 10,
ost->ltd_qos.ltq_penalty >> 10,
- ost->ltd_qos.ltq_oss->lqo_penalty_per_obj >> 10,
- ost->ltd_qos.ltq_oss->lqo_penalty >> 10,
+ ost->ltd_qos.ltq_svr->lsq_penalty_per_obj >> 10,
+ ost->ltd_qos.ltq_svr->lsq_penalty >> 10,
ost->ltd_qos.ltq_weight >> 10);
}
RETURN(0);
}
-void lod_qos_rr_init(struct lod_qos_rr *lqr)
+void lod_qos_rr_init(struct lu_qos_rr *lqr)
{
spin_lock_init(&lqr->lqr_alloc);
lqr->lqr_dirty = 1;
}
-
#define LOV_QOS_EMPTY ((__u32)-1)
/**
* \retval -ENOMEM fails to allocate the array
*/
static int lod_qos_calc_rr(struct lod_device *lod, struct ost_pool *src_pool,
- struct lod_qos_rr *lqr)
+ struct lu_qos_rr *lqr)
{
- struct lod_qos_oss *oss;
+ struct lu_svr_qos *oss;
struct lod_tgt_desc *ost;
unsigned placed, real_count;
unsigned int i;
/* Place all the OSTs from 1 OSS at the same time. */
placed = 0;
- list_for_each_entry(oss, &lod->lod_qos.lq_oss_list, lqo_oss_list) {
+ list_for_each_entry(oss, &lod->lod_qos.lq_svr_list, lsq_svr_list) {
int j = 0;
for (i = 0; i < lqr->lqr_pool.op_count; i++) {
ost = OST_TGT(lod,src_pool->op_array[i]);
LASSERT(ost && ost->ltd_ost);
- if (ost->ltd_qos.ltq_oss != oss)
+ if (ost->ltd_qos.ltq_svr != oss)
continue;
/* Evenly space these OSTs across arrayspace */
- next = j * lqr->lqr_pool.op_count / oss->lqo_ost_count;
+ next = j * lqr->lqr_pool.op_count / oss->lsq_tgt_count;
while (lqr->lqr_pool.op_array[next] != LOV_QOS_EMPTY)
next = (next + 1) % lqr->lqr_pool.op_count;
{
struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
struct lod_tgt_desc *ost = OST_TGT(lod, index);
- struct lod_qos_oss *lqo = ost->ltd_qos.ltq_oss;
+ struct lu_svr_qos *lsq = ost->ltd_qos.ltq_svr;
bool used = false;
int i;
/* check OSS use */
for (i = 0; i < lag->lag_oaa_count; i++) {
- if (lag->lag_oss_avoid_array[i] == lqo->lqo_id) {
+ if (lag->lag_oss_avoid_array[i] == lsq->lsq_id) {
used = true;
break;
}
struct obd_statfs *sfs = &lod_env_info(env)->lti_osfs;
struct pool_desc *pool = NULL;
struct ost_pool *osts;
- struct lod_qos_rr *lqr;
+ struct lu_qos_rr *lqr;
unsigned int i, array_idx;
__u32 ost_start_idx_temp;
__u32 stripe_idx = 0;
*/
for (j = 0; j < comp->llc_stripe_count; j++) {
struct lod_tgt_desc *ost;
- struct lod_qos_oss *lqo;
+ struct lu_svr_qos *lsq;
int k;
ost = OST_TGT(lod, comp->llc_ost_indices[j]);
- lqo = ost->ltd_qos.ltq_oss;
+ lsq = ost->ltd_qos.ltq_svr;
if (cfs_bitmap_check(bitmap, ost->ltd_index))
continue;
for (k = 0; k < lag->lag_oaa_count; k++) {
if (lag->lag_oss_avoid_array[k] ==
- lqo->lqo_id)
+ lsq->lsq_id)
break;
}
if (k == lag->lag_oaa_count) {
lag->lag_oss_avoid_array[k] =
- lqo->lqo_id;
+ lsq->lsq_id;
lag->lag_oaa_count++;
}
}
obdclass-all-objs += linkea.o
obdclass-all-objs += kernelcomm.o jobid.o
obdclass-all-objs += integrity.o obd_cksum.o
+obdclass-all-objs += lu_qos.o
@SERVER_TRUE@obdclass-all-objs += acl.o
@SERVER_TRUE@obdclass-all-objs += idmap.o
--- /dev/null
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lustre/obdclass/lu_qos.c
+ *
+ * Lustre QoS.
+ * These are the only exported functions; they provide generic
+ * infrastructure for object allocation QoS.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/module.h>
+#include <linux/list.h>
+#include <libcfs/libcfs.h>
+#include <libcfs/libcfs_hash.h> /* hash_long() */
+#include <libcfs/linux/linux-mem.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_disk.h>
+#include <lustre_fid.h>
+#include <lu_object.h>
+
+/**
+ * Add a new target to Quality of Service (QoS) target table.
+ *
+ * Add a new MDT/OST target to the structure representing its MDS/OSS.
+ * Re-sort the list of known servers by the number of targets attached
+ * to each server. The server list is protected internally and no
+ * external locking is required.
+ *
+ * \param[in] qos lu_qos data
+ * \param[in] ltd target description
+ *
+ * \retval 0 on success
+ * \retval -ENOMEM on error
+ */
+int lqos_add_tgt(struct lu_qos *qos, struct lu_tgt_desc *ltd)
+{
+ struct lu_svr_qos *svr = NULL;
+ struct lu_svr_qos *tempsvr;
+ struct obd_export *exp = ltd->ltd_exp;
+ int found = 0;
+ __u32 id = 0;
+ int rc = 0;
+
+ ENTRY;
+
+ down_write(&qos->lq_rw_sem);
+ /*
+ * a bit hacky approach to learn NID of corresponding connection
+ * but there is no official API to access information like this
+ * with OSD API.
+ */
+ list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) {
+ if (obd_uuid_equals(&svr->lsq_uuid,
+ &exp->exp_connection->c_remote_uuid)) {
+ found++;
+ break;
+ }
+ if (svr->lsq_id > id)
+ id = svr->lsq_id;
+ }
+
+ if (!found) {
+ OBD_ALLOC_PTR(svr);
+ if (!svr)
+ GOTO(out, rc = -ENOMEM);
+ memcpy(&svr->lsq_uuid, &exp->exp_connection->c_remote_uuid,
+ sizeof(svr->lsq_uuid));
+ ++id;
+ svr->lsq_id = id;
+ } else {
+ /* Assume we have to move this one */
+ list_del(&svr->lsq_svr_list);
+ }
+
+ svr->lsq_tgt_count++;
+ ltd->ltd_qos.ltq_svr = svr;
+
+ CDEBUG(D_OTHER, "add tgt %s to server %s (%d targets)\n",
+ obd_uuid2str(<d->ltd_uuid), obd_uuid2str(&svr->lsq_uuid),
+ svr->lsq_tgt_count);
+
+ /*
+ * Add sorted by # of tgts. Find the first entry that we're
+ * bigger than...
+ */
+ list_for_each_entry(tempsvr, &qos->lq_svr_list, lsq_svr_list) {
+ if (svr->lsq_tgt_count > tempsvr->lsq_tgt_count)
+ break;
+ }
+ /*
+ * ...and add before it. If we're the first or smallest, tempsvr
+ * points to the list head, and we add to the end.
+ */
+ list_add_tail(&svr->lsq_svr_list, &tempsvr->lsq_svr_list);
+
+ qos->lq_dirty = 1;
+ qos->lq_rr.lqr_dirty = 1;
+
+out:
+ up_write(&qos->lq_rw_sem);
+ RETURN(rc);
+}
+EXPORT_SYMBOL(lqos_add_tgt);
+
+/**
+ * Remove MDT/OST target from QoS table.
+ *
+ * Removes given MDT/OST target from the QoS table and releases the related
+ * MDS/OSS structure if no targets remain on the server.
+ *
+ * \param[in] qos lu_qos data
+ * \param[in] ltd target description
+ *
+ * \retval 0 on success
+ * \retval -ENOENT if no server was found
+ */
+int lqos_del_tgt(struct lu_qos *qos, struct lu_tgt_desc *ltd)
+{
+ struct lu_svr_qos *svr;
+ int rc = 0;
+
+ ENTRY;
+
+ down_write(&qos->lq_rw_sem);
+ svr = ltd->ltd_qos.ltq_svr;
+ if (!svr)
+ GOTO(out, rc = -ENOENT);
+
+ svr->lsq_tgt_count--;
+ if (svr->lsq_tgt_count == 0) {
+ CDEBUG(D_OTHER, "removing server %s\n",
+ obd_uuid2str(&svr->lsq_uuid));
+ list_del(&svr->lsq_svr_list);
+ ltd->ltd_qos.ltq_svr = NULL;
+ OBD_FREE_PTR(svr);
+ }
+
+ qos->lq_dirty = 1;
+ qos->lq_rr.lqr_dirty = 1;
+out:
+ up_write(&qos->lq_rw_sem);
+ RETURN(rc);
+}
+EXPORT_SYMBOL(lqos_del_tgt);
}
run_test 412 "mkdir on specific MDTs"
-test_413() {
+test_413a() {
[ $MDSCOUNT -lt 2 ] &&
skip "We need at least 2 MDTs for this test"
error "don't expect $max"
done
}
-run_test 413 "mkdir on less full MDTs"
+run_test 413a "mkdir on less full MDTs"
+
+test_413b() {
+ [ $MDSCOUNT -lt 2 ] &&
+ skip "We need at least 2 MDTs for this test"
+
+ [ $MDS1_VERSION -lt $(version_code 2.12.52) ] &&
+ skip "Need server version at least 2.12.52"
+
+ mkdir $DIR/$tdir || error "mkdir failed"
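+ # set a default LMV layout that lets new subdirs pick any MDT ("space" hash)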
+ $LFS setdirstripe -D -i -1 -H space $DIR/$tdir ||
+ error "setdirstripe failed"
+
+ local qos_prio_free
+ local qos_threshold_rr
+ local count
+
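+ # lctl prints these values as percentages with a trailing "%"; strip it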
+ qos_prio_free=$($LCTL get_param -n lmv.*.qos_prio_free | head -n1)
+ qos_prio_free=${qos_prio_free%%%}
+ qos_threshold_rr=$($LCTL get_param -n lmv.*.qos_threshold_rr | head -n1)
+ qos_threshold_rr=${qos_threshold_rr%%%}
+
+ stack_trap "$LCTL set_param lmv.*.qos_prio_free=$qos_prio_free" EXIT
+ stack_trap "$LCTL set_param lmv.*.qos_threshold_rr=$qos_threshold_rr" \
+ EXIT
+
+ echo "mkdir with roundrobin"
+
+ $LCTL set_param lmv.*.qos_threshold_rr=100
+ for i in $(seq $((100 * MDSCOUNT))); do
+ mkdir $DIR/$tdir/subdir$i || error "mkdir subdir$i failed"
+ done
+ for i in $(seq $MDSCOUNT); do
+ count=$($LFS getdirstripe -i $DIR/$tdir/* | grep ^$((i - 1))$ |
+ wc -w)
+ echo "$count directories created on MDT$((i - 1))"
+ [ $count -eq 100 ] || error "subdirs are not evenly distributed"
+ done
+
+ rm -rf $DIR/$tdir/*
+
+ $LCTL set_param lmv.*.qos_threshold_rr=$qos_threshold_rr
+
+ local ffree
+ local max
+ local min
+ local max_index
+ local min_index
+
+ ffree=($(lctl get_param -n mdc.*[mM][dD][cC]-[^M]*.filesfree | uniq))
+ echo "MDT filesfree available: ${ffree[@]}"
+ max=${ffree[0]}
+ min=${ffree[0]}
+ max_index=0
+ min_index=0
+ for ((i = 0; i < ${#ffree[@]}; i++)); do
+ if [[ ${ffree[i]} -gt $max ]]; then
+ max=${ffree[i]}
+ max_index=$i
+ fi
+ if [[ ${ffree[i]} -lt $min ]]; then
+ min=${ffree[i]}
+ min_index=$i
+ fi
+ done
+ echo "Min free files: MDT$min_index: $min"
+ echo "Max free files: MDT$max_index: $max"
+
+ [ $min -eq 0 ] && skip "no free files in MDT$min_index"
+ [ $min -gt 10000000 ] && skip "too many free files in MDT$min_index"
+
+ # Check if we need to generate uneven MDTs
+ test_mkdir -i $min_index -c 1 -p $DIR/$tdir-MDT$min_index
+ local threshold=10
+ local diff=$((max - min))
+ local diff2=$((diff * 100 / min))
+
+ echo -n "Check for uneven MDTs: "
+ echo -n "diff=$diff files ($diff2%) must be > $threshold% ..."
+
+ if [ $diff2 -gt $threshold ]; then
+ echo "ok"
+ echo "Don't need to fill MDT$min_index"
+ else
+ # generate uneven MDTs: create files until the diff exceeds threshold%
+ echo "no"
+ diff2=$((threshold - diff2))
+ diff=$((min * diff2 / 100))
+ # 50 sec per 10000 files in vm
+ [ $diff -gt 40000 ] && [ "$SLOW" = "no" ] &&
+ skip "$diff files to create"
+ echo "Fill $diff2% diff in MDT$min_index with $diff files"
+ local i
+ local value="$(generate_string 1024)"
+ for i in $(seq $diff); do
+ $OPENFILE -f O_CREAT:O_LOV_DELAY_CREATE \
+ $DIR/$tdir-MDT$min_index/f$i > /dev/null ||
+ error "create f$i failed"
+ setfattr -n user.413b -v $value \
+ $DIR/$tdir-MDT$min_index/f$i ||
+ error "setfattr f$i failed"
+ done
+ fi
+
+ min=$((100 * MDSCOUNT))
+ max=0
+
+ echo "mkdir with balanced space usage"
+ $LCTL set_param lmv.*.qos_prio_free=100
+ for i in $(seq $((100 * MDSCOUNT))); do
+ mkdir $DIR/$tdir/subdir$i || error "mkdir subdir$i failed"
+ done
+ for i in $(seq $MDSCOUNT); do
+ count=$($LFS getdirstripe -i $DIR/$tdir/* | grep ^$((i - 1))$ |
+ wc -w)
+ echo "$count directories created on MDT$((i - 1))"
+ [ $min -gt $count ] && min=$count
+ [ $max -lt $count ] && max=$count
+ done
+ [ $((max - min)) -gt $MDSCOUNT ] ||
+ error "subdirs shouldn't be evenly distributed"
+}
+run_test 413b "mkdir with balanced space usage"
test_414() {
#define OBD_FAIL_PTLRPC_BULK_ATTACH 0x521