From: Lai Siyao Date: Sat, 3 Aug 2019 21:00:33 +0000 (+0800) Subject: LU-12624 obdclass: lu_tgt_descs cleanup X-Git-Tag: 2.12.90~28 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=45222b2ef279d62ac3aab0e7babc55d77e3c93a2;ds=sidebyside LU-12624 obdclass: lu_tgt_descs cleanup This patch cleans up code about lu_tgt_descs, so that it's cleaner to add MDT object QoS allocation support: * rename struct ost_pool to lu_tgt_pool. * put struct lu_qos, lmv_desc/lov_desc and lu_tgt_pool into struct lu_tgt_descs because it's more natural to manage these data there and fewer arguments are needed to pass around in related functions. * remove lu_tgt_descs.ltd_tgtnr, use lu_tgt_descs.ltd_lov_desc.ld_tgt_count instead, because they are duplicate. * other cleanups. Signed-off-by: Lai Siyao Change-Id: I46f2e0ff06a8e580bac1dfda9a09a549b38d487d Reviewed-on: https://review.whamcloud.com/35824 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Hongchao Zhang Reviewed-by: Oleg Drokin --- diff --git a/lustre/include/lu_object.h b/lustre/include/lu_object.h index 71d2209..8a9635e 100644 --- a/lustre/include/lu_object.h +++ b/lustre/include/lu_object.h @@ -1397,13 +1397,13 @@ static inline bool lu_object_is_cl(const struct lu_object *o) return lu_device_is_cl(o->lo_dev); } -/* Generic subset of OSTs */ -struct ost_pool { +/* Generic subset of tgts */ +struct lu_tgt_pool { __u32 *op_array; /* array of index of * lov_obd->lov_tgts */ - unsigned int op_count; /* number of OSTs in the array */ - unsigned int op_size; /* allocated size of lp_array */ - struct rw_semaphore op_rw_sem; /* to protect ost_pool use */ + unsigned int op_count; /* number of tgts in the array */ + unsigned int op_size; /* allocated size of op_array */ + struct rw_semaphore op_rw_sem; /* to protect lu_tgt_pool use */ }; /* round-robin QoS data for LOD/LMV */ @@ -1412,7 +1412,7 @@ struct lu_qos_rr { __u32 lqr_start_idx; /* start index of new inode */ __u32 
lqr_offset_idx;/* aliasing for start_idx */ int lqr_start_count;/* reseed counter */ - struct ost_pool lqr_pool; /* round-robin optimized list */ + struct lu_tgt_pool lqr_pool; /* round-robin optimized list */ unsigned long lqr_dirty:1; /* recalc round-robin list */ }; @@ -1473,13 +1473,29 @@ struct lu_tgt_desc_idx { struct lu_tgt_desc *ldi_tgt[TGT_PTRS_PER_BLOCK]; }; +/* QoS data for LOD/LMV */ +struct lu_qos { + struct list_head lq_svr_list; /* lu_svr_qos list */ + struct rw_semaphore lq_rw_sem; + __u32 lq_active_svr_count; + unsigned int lq_prio_free; /* priority for free space */ + unsigned int lq_threshold_rr;/* priority for rr */ + struct lu_qos_rr lq_rr; /* round robin qos data */ + unsigned long lq_dirty:1, /* recalc qos data */ + lq_same_space:1,/* the servers all have approx. + * the same space avail */ + lq_reset:1; /* zero current penalties */ +}; + struct lu_tgt_descs { + union { + struct lov_desc ltd_lov_desc; + struct lmv_desc ltd_lmv_desc; + }; /* list of known TGTs */ struct lu_tgt_desc_idx *ltd_tgt_idx[TGT_PTRS]; /* Size of the lu_tgts array, granted to be a power of 2 */ __u32 ltd_tgts_size; - /* number of registered TGTs */ - __u32 ltd_tgtnr; /* bitmap of TGTs available */ struct cfs_bitmap *ltd_tgt_bitmap; /* TGTs scheduled to be deleted */ @@ -1490,42 +1506,31 @@ struct lu_tgt_descs { struct mutex ltd_mutex; /* read/write semaphore used for array relocation */ struct rw_semaphore ltd_rw_sem; + /* QoS */ + struct lu_qos ltd_qos; + /* all tgts in a packed array */ + struct lu_tgt_pool ltd_tgt_pool; + /* true if tgt is MDT */ + bool ltd_is_mdt; }; #define LTD_TGT(ltd, index) \ (ltd)->ltd_tgt_idx[(index) / \ TGT_PTRS_PER_BLOCK]->ldi_tgt[(index) % TGT_PTRS_PER_BLOCK] -/* QoS data for LOD/LMV */ -struct lu_qos { - struct list_head lq_svr_list; /* lu_svr_qos list */ - struct rw_semaphore lq_rw_sem; - __u32 lq_active_svr_count; - unsigned int lq_prio_free; /* priority for free space */ - unsigned int lq_threshold_rr;/* priority for rr */ - struct 
lu_qos_rr lq_rr; /* round robin qos data */ - unsigned long lq_dirty:1, /* recalc qos data */ - lq_same_space:1,/* the servers all have approx. - * the same space avail */ - lq_reset:1; /* zero current penalties */ -}; - -void lu_qos_rr_init(struct lu_qos_rr *lqr); -int lqos_add_tgt(struct lu_qos *qos, struct lu_tgt_desc *ltd); -int lqos_del_tgt(struct lu_qos *qos, struct lu_tgt_desc *ltd); -bool lqos_is_usable(struct lu_qos *qos, __u32 active_tgt_nr); -int lqos_calc_penalties(struct lu_qos *qos, struct lu_tgt_descs *ltd, - __u32 active_tgt_nr, __u32 maxage, bool is_mdt); -void lqos_calc_weight(struct lu_tgt_desc *tgt); -int lqos_recalc_weight(struct lu_qos *qos, struct lu_tgt_descs *ltd, - struct lu_tgt_desc *tgt, __u32 active_tgt_nr, - __u64 *total_wt); u64 lu_prandom_u64_max(u64 ep_ro); +void lu_qos_rr_init(struct lu_qos_rr *lqr); +int lu_qos_add_tgt(struct lu_qos *qos, struct lu_tgt_desc *ltd); +void lu_tgt_qos_weight_calc(struct lu_tgt_desc *tgt); -int lu_tgt_descs_init(struct lu_tgt_descs *ltd); +int lu_tgt_descs_init(struct lu_tgt_descs *ltd, bool is_mdt); void lu_tgt_descs_fini(struct lu_tgt_descs *ltd); -int lu_tgt_descs_add(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt); -void lu_tgt_descs_del(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt); +int ltd_add_tgt(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt); +void ltd_del_tgt(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt); +bool ltd_qos_is_usable(struct lu_tgt_descs *ltd); +int ltd_qos_penalties_calc(struct lu_tgt_descs *ltd); +int ltd_qos_update(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt, + __u64 *total_wt); static inline struct lu_tgt_desc *ltd_first_tgt(struct lu_tgt_descs *ltd) { diff --git a/lustre/include/obd.h b/lustre/include/obd.h index fcd78b1..c1c1f37 100644 --- a/lustre/include/obd.h +++ b/lustre/include/obd.h @@ -394,7 +394,7 @@ struct lov_md_tgt_desc { struct lov_obd { struct lov_desc desc; struct lov_tgt_desc **lov_tgts; /* sparse array */ - struct ost_pool lov_packed; /* 
all OSTs in a packed + struct lu_tgt_pool lov_packed; /* all OSTs in a packed array */ struct mutex lov_lock; struct obd_connect_data lov_ocd; @@ -424,7 +424,6 @@ struct lov_obd { struct lmv_obd { struct lu_client_fld lmv_fld; spinlock_t lmv_lock; - struct lmv_desc desc; int connected; int max_easize; @@ -437,10 +436,12 @@ struct lmv_obd { struct kobject *lmv_tgts_kobj; void *lmv_cache; - struct lu_qos lmv_qos; __u32 lmv_qos_rr_index; }; +#define lmv_mdt_count lmv_mdt_descs.ltd_lmv_desc.ld_tgt_count +#define lmv_qos lmv_mdt_descs.ltd_qos + /* Minimum sector size is 512 */ #define MAX_GUARD_NUMBER (PAGE_SIZE / 512) diff --git a/lustre/lmv/lmv_fld.c b/lustre/lmv/lmv_fld.c index db45c4b..f31b1b9 100644 --- a/lustre/lmv/lmv_fld.c +++ b/lustre/lmv/lmv_fld.c @@ -78,11 +78,11 @@ int lmv_fld_lookup(struct lmv_obd *lmv, const struct lu_fid *fid, u32 *mds) CDEBUG(D_INODE, "FLD lookup got mds #%x for fid="DFID"\n", *mds, PFID(fid)); - if (*mds >= lmv->desc.ld_tgt_count) { + if (*mds >= lmv->lmv_mdt_descs.ltd_tgts_size) { rc = -EINVAL; CERROR("%s: FLD lookup got invalid mds #%x (max: %x) for fid="DFID": rc = %d\n", - obd->obd_name, *mds, lmv->desc.ld_tgt_count, PFID(fid), - rc); + obd->obd_name, *mds, lmv->lmv_mdt_descs.ltd_tgts_size, + PFID(fid), rc); } RETURN(rc); } diff --git a/lustre/lmv/lmv_internal.h b/lustre/lmv/lmv_internal.h index 9f48a15..a58bebd 100644 --- a/lustre/lmv/lmv_internal.h +++ b/lustre/lmv/lmv_internal.h @@ -122,7 +122,7 @@ lmv_fid2tgt_index(struct lmv_obd *lmv, const struct lu_fid *fid) u32 mdt_idx; int rc; - if (lmv->desc.ld_tgt_count < 2) + if (lmv->lmv_mdt_count < 2) return 0; rc = lmv_fld_lookup(lmv, fid, &mdt_idx); diff --git a/lustre/lmv/lmv_obd.c b/lustre/lmv/lmv_obd.c index 8b59e35..a01505b 100644 --- a/lustre/lmv/lmv_obd.c +++ b/lustre/lmv/lmv_obd.c @@ -63,11 +63,12 @@ static int lmv_check_connect(struct obd_device *obd); void lmv_activate_target(struct lmv_obd *lmv, struct lmv_tgt_desc *tgt, int activate) { - if (tgt->ltd_active == activate) - 
return; + if (tgt->ltd_active == activate) + return; - tgt->ltd_active = activate; - lmv->desc.ld_active_tgt_count += (activate ? 1 : -1); + tgt->ltd_active = activate; + lmv->lmv_mdt_descs.ltd_lmv_desc.ld_active_tgt_count += + (activate ? 1 : -1); tgt->ltd_exp->exp_obd->obd_inactive = !activate; } @@ -343,11 +344,11 @@ int lmv_connect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt) tgt->ltd_active = 1; tgt->ltd_exp = mdc_exp; - lmv->desc.ld_active_tgt_count++; + lmv->lmv_mdt_descs.ltd_lmv_desc.ld_active_tgt_count++; md_init_ea_size(tgt->ltd_exp, lmv->max_easize, lmv->max_def_easize); - rc = lqos_add_tgt(&lmv->lmv_qos, tgt); + rc = lu_qos_add_tgt(&lmv->lmv_qos, tgt); if (rc) { obd_disconnect(mdc_exp); RETURN(rc); @@ -370,8 +371,7 @@ int lmv_connect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt) static void lmv_del_target(struct lmv_obd *lmv, struct lu_tgt_desc *tgt) { LASSERT(tgt); - lqos_del_tgt(&lmv->lmv_qos, tgt); - lu_tgt_descs_del(&lmv->lmv_mdt_descs, tgt); + ltd_del_tgt(&lmv->lmv_mdt_descs, tgt); OBD_FREE_PTR(tgt); } @@ -382,7 +382,6 @@ static int lmv_add_target(struct obd_device *obd, struct obd_uuid *uuidp, struct lmv_obd *lmv = &obd->u.lmv; struct lmv_tgt_desc *tgt; struct lu_tgt_descs *ltd = &lmv->lmv_mdt_descs; - int orig_tgt_count = 0; int rc = 0; ENTRY; @@ -406,11 +405,7 @@ static int lmv_add_target(struct obd_device *obd, struct obd_uuid *uuidp, tgt->ltd_active = 0; mutex_lock(&ltd->ltd_mutex); - rc = lu_tgt_descs_add(ltd, tgt); - if (!rc && index >= lmv->desc.ld_tgt_count) { - orig_tgt_count = lmv->desc.ld_tgt_count; - lmv->desc.ld_tgt_count = index + 1; - } + rc = ltd_add_tgt(ltd, tgt); mutex_unlock(&ltd->ltd_mutex); if (rc) @@ -421,14 +416,10 @@ static int lmv_add_target(struct obd_device *obd, struct obd_uuid *uuidp, RETURN(0); rc = lmv_connect_mdc(obd, tgt); - if (rc != 0) { - mutex_lock(&ltd->ltd_mutex); - lmv->desc.ld_tgt_count = orig_tgt_count; - memset(tgt, 0, sizeof(*tgt)); - mutex_unlock(&ltd->ltd_mutex); - } else { + if (!rc) { int 
easize = sizeof(struct lmv_stripe_md) + - lmv->desc.ld_tgt_count * sizeof(struct lu_fid); + lmv->lmv_mdt_count * sizeof(struct lu_fid); + lmv_init_ea_size(obd->obd_self_export, easize, 0); } @@ -455,7 +446,7 @@ static int lmv_check_connect(struct obd_device *obd) if (lmv->connected) GOTO(unlock, rc = 0); - if (lmv->desc.ld_tgt_count == 0) { + if (!lmv->lmv_mdt_count) { CERROR("%s: no targets configured: rc = -EINVAL\n", obd->obd_name); GOTO(unlock, rc = -EINVAL); @@ -477,7 +468,7 @@ static int lmv_check_connect(struct obd_device *obd) } lmv->connected = 1; - easize = lmv_mds_md_size(lmv->desc.ld_tgt_count, LMV_MAGIC); + easize = lmv_mds_md_size(lmv->lmv_mdt_count, LMV_MAGIC); lmv_init_ea_size(obd->obd_self_export, easize, 0); EXIT; unlock: @@ -491,7 +482,7 @@ out_disc: if (!tgt->ltd_exp) continue; - --lmv->desc.ld_active_tgt_count; + --lmv->lmv_mdt_descs.ltd_lmv_desc.ld_active_tgt_count; obd_disconnect(tgt->ltd_exp); } @@ -827,7 +818,7 @@ static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp, struct lmv_obd *lmv = &obddev->u.lmv; struct lu_tgt_desc *tgt = NULL; int set = 0; - __u32 count = lmv->desc.ld_tgt_count; + __u32 count = lmv->lmv_mdt_count; int rc = 0; ENTRY; @@ -843,7 +834,8 @@ static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp, __u32 index; memcpy(&index, data->ioc_inlbuf2, sizeof(__u32)); - if (index >= count) + + if (index >= lmv->lmv_mdt_descs.ltd_tgts_size) RETURN(-ENODEV); tgt = lmv_tgt(lmv, index); @@ -876,12 +868,7 @@ static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp, struct obd_quotactl *oqctl; if (qctl->qc_valid == QC_MDTIDX) { - if (count <= qctl->qc_idx) - RETURN(-EINVAL); - tgt = lmv_tgt(lmv, qctl->qc_idx); - if (!tgt || !tgt->ltd_exp) - RETURN(-EINVAL); } else if (qctl->qc_valid == QC_UUID) { lmv_foreach_tgt(lmv, tgt) { if (!obd_uuid_equals(&tgt->ltd_uuid, @@ -897,10 +884,9 @@ static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp, RETURN(-EINVAL); } - if (tgt->ltd_index >= count) - 
RETURN(-EAGAIN); + if (!tgt || !tgt->ltd_exp) + RETURN(-EINVAL); - LASSERT(tgt != NULL && tgt->ltd_exp != NULL); OBD_ALLOC_PTR(oqctl); if (!oqctl) RETURN(-ENOMEM); @@ -1081,7 +1067,7 @@ static u32 lmv_placement_policy(struct obd_device *obd, ENTRY; - if (lmv->desc.ld_tgt_count == 1) + if (lmv->lmv_mdt_count == 1) RETURN(0); lum = op_data->op_data; @@ -1199,27 +1185,17 @@ static int lmv_setup(struct obd_device *obd, struct lustre_cfg *lcfg) RETURN(-EINVAL); } - obd_str2uuid(&lmv->desc.ld_uuid, desc->ld_uuid.uuid); - lmv->desc.ld_tgt_count = 0; - lmv->desc.ld_active_tgt_count = 0; - lmv->desc.ld_qos_maxage = LMV_DESC_QOS_MAXAGE_DEFAULT; + obd_str2uuid(&lmv->lmv_mdt_descs.ltd_lmv_desc.ld_uuid, + desc->ld_uuid.uuid); + lmv->lmv_mdt_descs.ltd_lmv_desc.ld_tgt_count = 0; + lmv->lmv_mdt_descs.ltd_lmv_desc.ld_active_tgt_count = 0; + lmv->lmv_mdt_descs.ltd_lmv_desc.ld_qos_maxage = + LMV_DESC_QOS_MAXAGE_DEFAULT; lmv->max_def_easize = 0; lmv->max_easize = 0; spin_lock_init(&lmv->lmv_lock); - /* Set up allocation policy (QoS and RR) */ - INIT_LIST_HEAD(&lmv->lmv_qos.lq_svr_list); - init_rwsem(&lmv->lmv_qos.lq_rw_sem); - lmv->lmv_qos.lq_dirty = 1; - lmv->lmv_qos.lq_reset = 1; - /* Default priority is toward free space balance */ - lmv->lmv_qos.lq_prio_free = 232; - /* Default threshold for rr (roughly 17%) */ - lmv->lmv_qos.lq_threshold_rr = 43; - - lu_qos_rr_init(&lmv->lmv_qos.lq_rr); - /* * initialize rr_index to lower 32bit of netid, so that client * can distribute subdirs evenly from the beginning. 
@@ -1241,7 +1217,7 @@ static int lmv_setup(struct obd_device *obd, struct lustre_cfg *lcfg) if (rc) CERROR("Can't init FLD, err %d\n", rc); - rc = lu_tgt_descs_init(&lmv->lmv_mdt_descs); + rc = lu_tgt_descs_init(&lmv->lmv_mdt_descs, true); if (rc) CWARN("%s: error initialize target table: rc = %d\n", obd->obd_name, rc); @@ -1304,7 +1280,7 @@ static int lmv_select_statfs_mdt(struct lmv_obd *lmv, __u32 flags) if (flags & OBD_STATFS_FOR_MDT0) return 0; - if (lmv->lmv_statfs_start || lmv->desc.ld_tgt_count == 1) + if (lmv->lmv_statfs_start || lmv->lmv_mdt_count == 1) return lmv->lmv_statfs_start; /* choose initial MDT for this client */ @@ -1317,8 +1293,8 @@ static int lmv_select_statfs_mdt(struct lmv_obd *lmv, __u32 flags) /* We dont need a full 64-bit modulus, just enough * to distribute the requests across MDTs evenly. */ - lmv->lmv_statfs_start = - (u32)lnet_id.nid % lmv->desc.ld_tgt_count; + lmv->lmv_statfs_start = (u32)lnet_id.nid % + lmv->lmv_mdt_count; break; } } @@ -1346,8 +1322,8 @@ static int lmv_statfs(const struct lu_env *env, struct obd_export *exp, /* distribute statfs among MDTs */ idx = lmv_select_statfs_mdt(lmv, flags); - for (i = 0; i < lmv->desc.ld_tgt_count; i++, idx++) { - idx = idx % lmv->desc.ld_tgt_count; + for (i = 0; i < lmv->lmv_mdt_descs.ltd_tgts_size; i++, idx++) { + idx = idx % lmv->lmv_mdt_descs.ltd_tgts_size; tgt = lmv_tgt(lmv, idx); if (!tgt || !tgt->ltd_exp) continue; @@ -1423,7 +1399,7 @@ int lmv_statfs_check_update(struct obd_device *obd, struct lmv_tgt_desc *tgt) int rc; if (ktime_get_seconds() - tgt->ltd_statfs_age < - obd->u.lmv.desc.ld_qos_maxage) + obd->u.lmv.lmv_mdt_descs.ltd_lmv_desc.ld_qos_maxage) return 0; rc = obd_statfs_async(tgt->ltd_exp, &oinfo, 0, NULL); @@ -1565,17 +1541,15 @@ static struct lu_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv, __u32 *mdt) ENTRY; - if (!lqos_is_usable(&lmv->lmv_qos, lmv->desc.ld_active_tgt_count)) + if (!ltd_qos_is_usable(&lmv->lmv_mdt_descs)) RETURN(ERR_PTR(-EAGAIN)); 
down_write(&lmv->lmv_qos.lq_rw_sem); - if (!lqos_is_usable(&lmv->lmv_qos, lmv->desc.ld_active_tgt_count)) + if (!ltd_qos_is_usable(&lmv->lmv_mdt_descs)) GOTO(unlock, tgt = ERR_PTR(-EAGAIN)); - rc = lqos_calc_penalties(&lmv->lmv_qos, &lmv->lmv_mdt_descs, - lmv->desc.ld_active_tgt_count, - lmv->desc.ld_qos_maxage, true); + rc = ltd_qos_penalties_calc(&lmv->lmv_mdt_descs); if (rc) GOTO(unlock, tgt = ERR_PTR(rc)); @@ -1585,7 +1559,7 @@ static struct lu_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv, __u32 *mdt) continue; tgt->ltd_qos.ltq_usable = 1; - lqos_calc_weight(tgt); + lu_tgt_qos_weight_calc(tgt); total_weight += tgt->ltd_qos.ltq_weight; } @@ -1600,9 +1574,7 @@ static struct lu_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv, __u32 *mdt) continue; *mdt = tgt->ltd_index; - lqos_recalc_weight(&lmv->lmv_qos, &lmv->lmv_mdt_descs, tgt, - lmv->desc.ld_active_tgt_count, - &total_weight); + ltd_qos_update(&lmv->lmv_mdt_descs, tgt, &total_weight); GOTO(unlock, rc = 0); } @@ -1623,14 +1595,16 @@ static struct lu_tgt_desc *lmv_locate_tgt_rr(struct lmv_obd *lmv, __u32 *mdt) ENTRY; spin_lock(&lmv->lmv_qos.lq_rr.lqr_alloc); - for (i = 0; i < lmv->desc.ld_tgt_count; i++) { - index = (i + lmv->lmv_qos_rr_index) % lmv->desc.ld_tgt_count; + for (i = 0; i < lmv->lmv_mdt_descs.ltd_tgts_size; i++) { + index = (i + lmv->lmv_qos_rr_index) % + lmv->lmv_mdt_descs.ltd_tgts_size; tgt = lmv_tgt(lmv, index); if (!tgt || !tgt->ltd_exp || !tgt->ltd_active) continue; *mdt = tgt->ltd_index; - lmv->lmv_qos_rr_index = (*mdt + 1) % lmv->desc.ld_tgt_count; + lmv->lmv_qos_rr_index = (*mdt + 1) % + lmv->lmv_mdt_descs.ltd_tgts_size; spin_unlock(&lmv->lmv_qos.lq_rr.lqr_alloc); RETURN(tgt); @@ -1825,7 +1799,7 @@ int lmv_create(struct obd_export *exp, struct md_op_data *op_data, ENTRY; - if (!lmv->desc.ld_active_tgt_count) + if (!lmv->lmv_mdt_descs.ltd_lmv_desc.ld_active_tgt_count) RETURN(-EIO); if (lmv_dir_bad_hash(op_data->op_mea1)) @@ -2964,7 +2938,7 @@ static int lmv_get_info(const struct lu_env 
*env, struct obd_export *exp, exp->exp_connect_data = *(struct obd_connect_data *)val; RETURN(rc); } else if (KEY_IS(KEY_TGT_COUNT)) { - *((int *)val) = lmv->desc.ld_tgt_count; + *((int *)val) = lmv->lmv_mdt_descs.ltd_tgts_size; RETURN(0); } @@ -2978,7 +2952,7 @@ static int lmv_rmfid(struct obd_export *exp, struct fid_array *fa, struct obd_device *obddev = class_exp2obd(exp); struct ptlrpc_request_set *set = _set; struct lmv_obd *lmv = &obddev->u.lmv; - int tgt_count = lmv->desc.ld_tgt_count; + int tgt_count = lmv->lmv_mdt_count; struct lu_tgt_desc *tgt; struct fid_array *fat, **fas = NULL; int i, rc, **rcs = NULL; @@ -3365,8 +3339,8 @@ enum ldlm_mode lmv_lock_match(struct obd_export *exp, __u64 flags, * since this can be easily found, and only try others if that fails. */ for (i = 0, index = lmv_fid2tgt_index(lmv, fid); - i < lmv->desc.ld_tgt_count; - i++, index = (index + 1) % lmv->desc.ld_tgt_count) { + i < lmv->lmv_mdt_descs.ltd_tgts_size; + i++, index = (index + 1) % lmv->lmv_mdt_descs.ltd_tgts_size) { if (index < 0) { CDEBUG(D_HA, "%s: "DFID" is inaccessible: rc = %d\n", obd->obd_name, PFID(fid), index); diff --git a/lustre/lmv/lproc_lmv.c b/lustre/lmv/lproc_lmv.c index 52f5953..97faa02 100644 --- a/lustre/lmv/lproc_lmv.c +++ b/lustre/lmv/lproc_lmv.c @@ -44,10 +44,8 @@ static ssize_t numobd_show(struct kobject *kobj, struct attribute *attr, { struct obd_device *dev = container_of(kobj, struct obd_device, obd_kset.kobj); - struct lmv_desc *desc; - desc = &dev->u.lmv.desc; - return sprintf(buf, "%u\n", desc->ld_tgt_count); + return sprintf(buf, "%u\n", dev->u.lmv.lmv_mdt_count); } LUSTRE_RO_ATTR(numobd); @@ -56,10 +54,9 @@ static ssize_t activeobd_show(struct kobject *kobj, struct attribute *attr, { struct obd_device *dev = container_of(kobj, struct obd_device, obd_kset.kobj); - struct lmv_desc *desc; - desc = &dev->u.lmv.desc; - return sprintf(buf, "%u\n", desc->ld_active_tgt_count); + return sprintf(buf, "%u\n", + 
dev->u.lmv.lmv_mdt_descs.ltd_lmv_desc.ld_active_tgt_count); } LUSTRE_RO_ATTR(activeobd); @@ -68,10 +65,9 @@ static ssize_t desc_uuid_show(struct kobject *kobj, struct attribute *attr, { struct obd_device *dev = container_of(kobj, struct obd_device, obd_kset.kobj); - struct lmv_desc *desc; - desc = &dev->u.lmv.desc; - return sprintf(buf, "%s\n", desc->ld_uuid.uuid); + return sprintf(buf, "%s\n", + dev->u.lmv.lmv_mdt_descs.ltd_lmv_desc.ld_uuid.uuid); } LUSTRE_RO_ATTR(desc_uuid); @@ -82,7 +78,8 @@ static ssize_t qos_maxage_show(struct kobject *kobj, struct obd_device *dev = container_of(kobj, struct obd_device, obd_kset.kobj); - return sprintf(buf, "%u\n", dev->u.lmv.desc.ld_qos_maxage); + return sprintf(buf, "%u\n", + dev->u.lmv.lmv_mdt_descs.ltd_lmv_desc.ld_qos_maxage); } static ssize_t qos_maxage_store(struct kobject *kobj, @@ -99,7 +96,7 @@ static ssize_t qos_maxage_store(struct kobject *kobj, if (rc) return rc; - dev->u.lmv.desc.ld_qos_maxage = val; + dev->u.lmv.lmv_mdt_descs.ltd_lmv_desc.ld_qos_maxage = val; return count; } diff --git a/lustre/lod/lod_dev.c b/lustre/lod/lod_dev.c index 99ccbb0..a857df1 100644 --- a/lustre/lod/lod_dev.c +++ b/lustre/lod/lod_dev.c @@ -358,8 +358,7 @@ static int lod_sub_recovery_thread(void *arg) struct llog_ctxt *ctxt = NULL; struct lu_env env; struct lu_target *lut; - struct lod_tgt_descs *ltd = &lod->lod_mdt_descs; - struct lod_tgt_desc *tgt = NULL; + struct lu_tgt_desc *mdt = NULL; time64_t start; int retries = 0; int rc; @@ -452,8 +451,8 @@ again: GOTO(out, rc = 0); } - ltd_foreach_tgt(ltd, tgt) { - if (!tgt->ltd_got_update_log) { + lod_foreach_mdt(lod, mdt) { + if (!mdt->ltd_got_update_log) { spin_unlock(&lod->lod_lock); GOTO(out, rc = 0); } @@ -616,12 +615,12 @@ int lod_sub_init_llog(const struct lu_env *env, struct lod_device *lod, thread = &lod->lod_child_recovery_thread; index = master_index; } else { - struct lu_tgt_desc *tgt; + struct lu_tgt_desc *mdt; - ltd_foreach_tgt(&lod->lod_mdt_descs, tgt) { - if (tgt->ltd_tgt == 
dt) { - index = tgt->ltd_index; - subtgt = tgt; + lod_foreach_mdt(lod, mdt) { + if (mdt->ltd_tgt == dt) { + index = mdt->ltd_index; + subtgt = mdt; break; } } @@ -688,9 +687,8 @@ free_lrd: static void lod_sub_stop_recovery_threads(const struct lu_env *env, struct lod_device *lod) { - struct lod_tgt_descs *ltd = &lod->lod_mdt_descs; struct ptlrpc_thread *thread; - struct lu_tgt_desc *tgt; + struct lu_tgt_desc *mdt; /* * Stop the update log commit cancel threads and finish master @@ -704,20 +702,19 @@ static void lod_sub_stop_recovery_threads(const struct lu_env *env, wait_event(thread->t_ctl_waitq, thread->t_flags & SVC_STOPPED); } - lod_getref(ltd); - ltd_foreach_tgt(ltd, tgt) { - thread = tgt->ltd_recovery_thread; + lod_getref(&lod->lod_mdt_descs); + lod_foreach_mdt(lod, mdt) { + thread = mdt->ltd_recovery_thread; if (thread && thread->t_flags & SVC_RUNNING) { thread->t_flags = SVC_STOPPING; wake_up(&thread->t_ctl_waitq); wait_event(thread->t_ctl_waitq, thread->t_flags & SVC_STOPPED); - OBD_FREE_PTR(tgt->ltd_recovery_thread); - tgt->ltd_recovery_thread = NULL; + OBD_FREE_PTR(mdt->ltd_recovery_thread); + mdt->ltd_recovery_thread = NULL; } } - - lod_putref(lod, ltd); + lod_putref(lod, &lod->lod_mdt_descs); } /** @@ -731,8 +728,7 @@ static void lod_sub_stop_recovery_threads(const struct lu_env *env, static void lod_sub_fini_all_llogs(const struct lu_env *env, struct lod_device *lod) { - struct lod_tgt_descs *ltd = &lod->lod_mdt_descs; - struct lu_tgt_desc *tgt; + struct lu_tgt_desc *mdt; /* * Stop the update log commit cancel threads and finish master @@ -740,19 +736,18 @@ static void lod_sub_fini_all_llogs(const struct lu_env *env, */ lod_sub_fini_llog(env, lod->lod_child, &lod->lod_child_recovery_thread); - lod_getref(ltd); - ltd_foreach_tgt(ltd, tgt) - lod_sub_fini_llog(env, tgt->ltd_tgt, - tgt->ltd_recovery_thread); - lod_putref(lod, ltd); + lod_getref(&lod->lod_mdt_descs); + lod_foreach_mdt(lod, mdt) + lod_sub_fini_llog(env, mdt->ltd_tgt, + 
mdt->ltd_recovery_thread); + lod_putref(lod, &lod->lod_mdt_descs); } static char *lod_show_update_logs_retrievers(void *data, int *size, int *count) { struct lod_device *lod = (struct lod_device *)data; struct lu_target *lut = lod2lu_dev(lod)->ld_site->ls_tgt; - struct lod_tgt_descs *ltd = &lod->lod_mdt_descs; - struct lod_tgt_desc *tgt = NULL; + struct lu_tgt_desc *mdt = NULL; char *buf; int len = 0; int rc; @@ -783,10 +778,10 @@ static char *lod_show_update_logs_retrievers(void *data, int *size, int *count) (*count)++; } - ltd_foreach_tgt(ltd, tgt) { - if (!tgt->ltd_got_update_log) { + lod_foreach_mdt(lod, mdt) { + if (!mdt->ltd_got_update_log) { rc = snprintf(buf + len, *size - len, " %04x", - tgt->ltd_index); + mdt->ltd_index); if (unlikely(rc <= 0)) break; @@ -954,9 +949,8 @@ static int lod_process_config(const struct lu_env *env, rc = lod_add_device(env, lod, arg1, index, gen, mdt_index, LUSTRE_OSC_NAME, 0); } else { - rc = lod_del_device(env, lod, - &lod->lod_ost_descs, - arg1, index, gen, true); + rc = lod_del_device(env, lod, &lod->lod_ost_descs, + arg1, index, gen); } break; @@ -973,9 +967,8 @@ static int lod_process_config(const struct lu_env *env, */ param = lustre_cfg_buf(lcfg, 1); if (strstr(param, "osp") && strstr(param, ".active=")) { - struct lod_tgt_descs *ltd = &lod->lod_mdt_descs; struct lod_tgt_desc *sub_tgt = NULL; - struct lu_tgt_desc *tgt; + struct lu_tgt_desc *mdt; char *ptr; char *tmp; @@ -989,9 +982,9 @@ static int lod_process_config(const struct lu_env *env, GOTO(out, rc); } - ltd_foreach_tgt(ltd, tgt) { - if (tgt->ltd_tgt->dd_lu_dev.ld_obd == obd) { - sub_tgt = tgt; + lod_foreach_mdt(lod, mdt) { + if (mdt->ltd_tgt->dd_lu_dev.ld_obd == obd) { + sub_tgt = mdt; break; } } @@ -1107,7 +1100,7 @@ static int lod_recovery_complete(const struct lu_env *env, { struct lod_device *lod = lu2lod_dev(dev); struct lu_device *next = &lod->lod_child->dd_lu_dev; - unsigned int i; + struct lod_tgt_desc *tgt; int rc; ENTRY; @@ -1118,17 +1111,15 @@ static 
int lod_recovery_complete(const struct lu_env *env, rc = next->ld_ops->ldo_recovery_complete(env, next); lod_getref(&lod->lod_ost_descs); - if (lod->lod_osts_size > 0) { - cfs_foreach_bit(lod->lod_ost_bitmap, i) { - struct lod_tgt_desc *tgt; - - tgt = OST_TGT(lod, i); + if (lod->lod_ost_descs.ltd_tgts_size > 0) { + lod_foreach_ost(lod, tgt) { LASSERT(tgt && tgt->ltd_tgt); - next = &tgt->ltd_ost->dd_lu_dev; + next = &tgt->ltd_tgt->dd_lu_dev; rc = next->ld_ops->ldo_recovery_complete(env, next); if (rc) CERROR("%s: can't complete recovery on #%d: rc = %d\n", - lod2obd(lod)->obd_name, i, rc); + lod2obd(lod)->obd_name, tgt->ltd_index, + rc); } } lod_putref(lod, &lod->lod_ost_descs); @@ -1149,8 +1140,7 @@ static int lod_recovery_complete(const struct lu_env *env, */ static int lod_sub_init_llogs(const struct lu_env *env, struct lod_device *lod) { - struct lod_tgt_descs *ltd = &lod->lod_mdt_descs; - struct lu_tgt_desc *tgt; + struct lu_tgt_desc *mdt; int rc; ENTRY; @@ -1166,8 +1156,8 @@ static int lod_sub_init_llogs(const struct lu_env *env, struct lod_device *lod) if (rc < 0) RETURN(rc); - ltd_foreach_tgt(ltd, tgt) { - rc = lod_sub_init_llog(env, lod, tgt->ltd_tgt); + lod_foreach_mdt(lod, mdt) { + rc = lod_sub_init_llog(env, lod, mdt->ltd_tgt); if (rc != 0) break; } @@ -1299,12 +1289,11 @@ static int lod_statfs(const struct lu_env *env, struct dt_device *dev, struct obd_statfs *sfs, struct obd_statfs_info *info) { struct lod_device *lod = dt2lod_dev(dev); - struct lod_ost_desc *ost; - struct lod_mdt_desc *mdt; + struct lu_tgt_desc *tgt; struct obd_statfs ost_sfs; u64 ost_files = 0; u64 ost_ffree = 0; - int i, rc, bs; + int rc, bs; rc = dt_statfs(env, dt2lod_dev(dev)->lod_child, sfs); if (rc) @@ -1318,10 +1307,8 @@ static int lod_statfs(const struct lu_env *env, struct dt_device *dev, sfs->os_granted = 0; lod_getref(&lod->lod_mdt_descs); - lod_foreach_mdt(lod, i) { - mdt = MDT_TGT(lod, i); - LASSERT(mdt && mdt->ltd_mdt); - rc = dt_statfs(env, mdt->ltd_mdt, &ost_sfs); + 
lod_foreach_mdt(lod, tgt) { + rc = dt_statfs(env, tgt->ltd_tgt, &ost_sfs); /* ignore errors */ if (rc) continue; @@ -1337,10 +1324,8 @@ static int lod_statfs(const struct lu_env *env, struct dt_device *dev, * just fallback to pre-DoM policy if any OST is alive */ lod_getref(&lod->lod_ost_descs); - lod_foreach_ost(lod, i) { - ost = OST_TGT(lod, i); - LASSERT(ost && ost->ltd_ost); - rc = dt_statfs(env, ost->ltd_ost, &ost_sfs); + lod_foreach_ost(lod, tgt) { + rc = dt_statfs(env, tgt->ltd_tgt, &ost_sfs); /* ignore errors */ if (rc || ost_sfs.os_bsize == 0) continue; @@ -1506,24 +1491,21 @@ static void lod_conf_get(const struct lu_env *env, static int lod_sync(const struct lu_env *env, struct dt_device *dev) { struct lod_device *lod = dt2lod_dev(dev); - struct lod_ost_desc *ost; - struct lod_mdt_desc *mdt; - unsigned int i; + struct lu_tgt_desc *tgt; int rc = 0; ENTRY; lod_getref(&lod->lod_ost_descs); - lod_foreach_ost(lod, i) { - ost = OST_TGT(lod, i); - LASSERT(ost && ost->ltd_ost); - if (!ost->ltd_active) + lod_foreach_ost(lod, tgt) { + if (!tgt->ltd_active) continue; - rc = dt_sync(env, ost->ltd_ost); + rc = dt_sync(env, tgt->ltd_tgt); if (rc) { if (rc != -ENOTCONN) { CERROR("%s: can't sync ost %u: rc = %d\n", - lod2obd(lod)->obd_name, i, rc); + lod2obd(lod)->obd_name, tgt->ltd_index, + rc); break; } rc = 0; @@ -1535,16 +1517,15 @@ static int lod_sync(const struct lu_env *env, struct dt_device *dev) RETURN(rc); lod_getref(&lod->lod_mdt_descs); - lod_foreach_mdt(lod, i) { - mdt = MDT_TGT(lod, i); - LASSERT(mdt && mdt->ltd_mdt); - if (!mdt->ltd_active) + lod_foreach_mdt(lod, tgt) { + if (!tgt->ltd_active) continue; - rc = dt_sync(env, mdt->ltd_mdt); + rc = dt_sync(env, tgt->ltd_tgt); if (rc) { if (rc != -ENOTCONN) { CERROR("%s: can't sync mdt %u: rc = %d\n", - lod2obd(lod)->obd_name, i, rc); + lod2obd(lod)->obd_name, tgt->ltd_index, + rc); break; } rc = 0; @@ -1754,8 +1735,8 @@ static int lod_init0(const struct lu_env *env, struct lod_device *lod, 
spin_lock_init(&lod->lod_lock); spin_lock_init(&lod->lod_connects_lock); - lu_tgt_descs_init(&lod->lod_mdt_descs); - lu_tgt_descs_init(&lod->lod_ost_descs); + lu_tgt_descs_init(&lod->lod_mdt_descs, true); + lu_tgt_descs_init(&lod->lod_ost_descs, false); RETURN(0); @@ -1852,12 +1833,12 @@ static struct lu_device *lod_device_fini(const struct lu_env *env, lod_procfs_fini(lod); - rc = lod_fini_tgt(env, lod, &lod->lod_ost_descs, true); + rc = lod_fini_tgt(env, lod, &lod->lod_ost_descs); if (rc) CERROR("%s: can not fini ost descriptors: rc = %d\n", lod2obd(lod)->obd_name, rc); - rc = lod_fini_tgt(env, lod, &lod->lod_mdt_descs, false); + rc = lod_fini_tgt(env, lod, &lod->lod_mdt_descs); if (rc) CERROR("%s: can not fini mdt descriptors: rc = %d\n", lod2obd(lod)->obd_name, rc); @@ -2039,7 +2020,6 @@ static int lod_obd_get_info(const struct lu_env *env, struct obd_export *exp, struct obd_device *obd = exp->exp_obd; struct lod_device *d; struct lod_tgt_desc *tgt; - unsigned int i; int rc = 1; if (!obd->obd_set_up || obd->obd_stopping) @@ -2047,9 +2027,7 @@ static int lod_obd_get_info(const struct lu_env *env, struct obd_export *exp, d = lu2lod_dev(obd->obd_lu_dev); lod_getref(&d->lod_ost_descs); - lod_foreach_ost(d, i) { - tgt = OST_TGT(d, i); - LASSERT(tgt && tgt->ltd_tgt); + lod_foreach_ost(d, tgt) { rc = obd_get_info(env, tgt->ltd_exp, keylen, key, vallen, val); /* one healthy device is enough */ @@ -2059,12 +2037,9 @@ static int lod_obd_get_info(const struct lu_env *env, struct obd_export *exp, lod_putref(d, &d->lod_ost_descs); lod_getref(&d->lod_mdt_descs); - lod_foreach_mdt(d, i) { + lod_foreach_mdt(d, tgt) { struct llog_ctxt *ctxt; - tgt = MDT_TGT(d, i); - LASSERT(tgt != NULL); - LASSERT(tgt->ltd_tgt != NULL); if (!tgt->ltd_active) continue; @@ -2105,7 +2080,7 @@ static int lod_obd_set_info_async(const struct lu_env *env, struct lod_device *d; struct lod_tgt_desc *tgt; int no_set = 0; - int i, rc = 0, rc2; + int rc = 0, rc2; ENTRY; @@ -2118,9 +2093,7 @@ static int 
lod_obd_set_info_async(const struct lu_env *env, d = lu2lod_dev(obd->obd_lu_dev); lod_getref(&d->lod_ost_descs); - lod_foreach_ost(d, i) { - tgt = OST_TGT(d, i); - LASSERT(tgt && tgt->ltd_tgt); + lod_foreach_ost(d, tgt) { if (!tgt->ltd_active) continue; @@ -2132,9 +2105,7 @@ static int lod_obd_set_info_async(const struct lu_env *env, lod_putref(d, &d->lod_ost_descs); lod_getref(&d->lod_mdt_descs); - lod_foreach_mdt(d, i) { - tgt = MDT_TGT(d, i); - LASSERT(tgt && tgt->ltd_tgt); + lod_foreach_mdt(d, tgt) { if (!tgt->ltd_active) continue; rc2 = obd_set_info_async(env, tgt->ltd_exp, keylen, key, diff --git a/lustre/lod/lod_internal.h b/lustre/lod/lod_internal.h index a7b4eed..bf517f9 100644 --- a/lustre/lod/lod_internal.h +++ b/lustre/lod/lod_internal.h @@ -56,7 +56,7 @@ struct pool_desc { char pool_name[LOV_MAXPOOLNAME + 1]; - struct ost_pool pool_obds; /* pool members */ + struct lu_tgt_pool pool_obds; /* pool members */ atomic_t pool_refcount; struct lu_qos_rr pool_rr; struct hlist_node pool_hash; /* access by poolname */ @@ -100,9 +100,6 @@ struct lod_device { lod_lmv_failout:1, lod_child_got_update_log:1; - /* lov settings descriptor storing static information */ - struct lov_desc lod_desc; - /* protect ld_active_tgt_count, ltd_active and lod_md_root */ spinlock_t lod_lock; @@ -119,14 +116,7 @@ struct lod_device { /* maximum size of MDT stripe for Data-on-MDT files. */ unsigned int lod_dom_max_stripesize; - /*FIXME: When QOS and pool is implemented for MDT, probably these - * structure should be moved to lod_tgt_descs as well. 
- */ - /* QoS info per LOD */ - struct lu_qos lod_qos; /* qos info per lod */ - /* OST pool data */ - struct ost_pool lod_pool_info; /* all OSTs in a packed array */ int lod_pool_count; struct cfs_hash *lod_pools_hash_body; /* used for key access */ struct list_head lod_pool_list; /* used for sequential access */ @@ -141,19 +131,9 @@ struct lod_device { struct lod_object *lod_md_root; }; -#define lod_osts lod_ost_descs.ltd_tgts -#define lod_ost_bitmap lod_ost_descs.ltd_tgt_bitmap -#define lod_ostnr lod_ost_descs.ltd_tgtnr -#define lod_osts_size lod_ost_descs.ltd_tgts_size -#define ltd_ost ltd_tgt -#define lod_ost_desc lu_tgt_desc - -#define lod_mdts lod_mdt_descs.ltd_tgts -#define lod_mdt_bitmap lod_mdt_descs.ltd_tgt_bitmap -#define lod_remote_mdt_count lod_mdt_descs.ltd_tgtnr -#define lod_mdts_size lod_mdt_descs.ltd_tgts_size -#define ltd_mdt ltd_tgt -#define lod_mdt_desc lu_tgt_desc +#define lod_ost_bitmap lod_ost_descs.ltd_tgt_bitmap +#define lod_ost_count lod_ost_descs.ltd_lov_desc.ld_tgt_count +#define lod_remote_mdt_count lod_mdt_descs.ltd_lmv_desc.ld_tgt_count struct lod_layout_component { struct lu_extent llc_extent; @@ -168,7 +148,7 @@ struct lod_layout_component { __u64 llc_timestamp; /* snapshot time */ char *llc_pool; /* ost list specified with LOV_USER_MAGIC_SPECIFIC lum */ - struct ost_pool llc_ostlist; + struct lu_tgt_pool llc_ostlist; struct dt_object **llc_stripe; __u32 *llc_ost_indices; }; @@ -506,12 +486,8 @@ static inline void lod_layout_get_pool(struct lod_layout_component *entries, } } -#define lod_foreach_ost(__dev, index) \ - if ((__dev)->lod_osts_size > 0) \ - cfs_foreach_bit((__dev)->lod_ost_bitmap, (index)) - -#define lod_foreach_mdt(mdt_dev, index) \ - cfs_foreach_bit((mdt_dev)->lod_mdt_bitmap, (index)) +#define lod_foreach_mdt(lod, mdt) ltd_foreach_tgt(&(lod)->lod_mdt_descs, mdt) +#define lod_foreach_ost(lod, ost) ltd_foreach_tgt(&(lod)->lod_ost_descs, ost) /* lod_dev.c */ extern struct kmem_cache *lod_object_kmem; @@ -531,10 +507,10 @@ 
int lod_add_device(const struct lu_env *env, struct lod_device *lod, char *osp, unsigned index, unsigned gen, int mdt_index, char *type, int active); int lod_del_device(const struct lu_env *env, struct lod_device *lod, - struct lod_tgt_descs *ltd, char *osp, unsigned idx, - unsigned gen, bool for_ost); + struct lod_tgt_descs *ltd, char *osp, unsigned int idx, + unsigned int gen); int lod_fini_tgt(const struct lu_env *env, struct lod_device *lod, - struct lod_tgt_descs *ltd, bool for_ost); + struct lod_tgt_descs *ltd); int lod_striping_load(const struct lu_env *env, struct lod_object *lo); int lod_striping_reload(const struct lu_env *env, struct lod_object *lo, const struct lu_buf *buf); @@ -622,14 +598,14 @@ int lod_alloc_comp_entries(struct lod_object *lo, int mirror_cnt, int comp_cnt); int lod_fill_mirrors(struct lod_object *lo); /* lod_pool.c */ -int lod_ost_pool_add(struct ost_pool *op, __u32 idx, unsigned int min_count); -int lod_ost_pool_remove(struct ost_pool *op, __u32 idx); -int lod_ost_pool_extend(struct ost_pool *op, unsigned int min_count); +int lod_ost_pool_add(struct lu_tgt_pool *op, __u32 idx, unsigned int min_count); +int lod_ost_pool_remove(struct lu_tgt_pool *op, __u32 idx); +int lod_ost_pool_extend(struct lu_tgt_pool *op, unsigned int min_count); struct pool_desc *lod_find_pool(struct lod_device *lod, char *poolname); void lod_pool_putref(struct pool_desc *pool); -int lod_ost_pool_free(struct ost_pool *op); +int lod_ost_pool_free(struct lu_tgt_pool *op); int lod_pool_del(struct obd_device *obd, char *poolname); -int lod_ost_pool_init(struct ost_pool *op, unsigned int count); +int lod_ost_pool_init(struct lu_tgt_pool *op, unsigned int count); extern struct cfs_hash_ops pool_hash_operations; int lod_check_index_in_pool(__u32 idx, struct pool_desc *pool); int lod_pool_new(struct obd_device *obd, char *poolname); diff --git a/lustre/lod/lod_lov.c b/lustre/lod/lod_lov.c index 95bf0cc..6590d3c 100644 --- a/lustre/lod/lod_lov.c +++ 
b/lustre/lod/lod_lov.c @@ -77,28 +77,23 @@ void lod_putref(struct lod_device *lod, struct lod_tgt_descs *ltd) if (ltd->ltd_refcount == 0 && ltd->ltd_death_row) { struct lod_tgt_desc *tgt_desc, *tmp; struct list_head kill; - unsigned int idx; CDEBUG(D_CONFIG, "destroying %d ltd desc\n", ltd->ltd_death_row); INIT_LIST_HEAD(&kill); - cfs_foreach_bit(ltd->ltd_tgt_bitmap, idx) { - tgt_desc = LTD_TGT(ltd, idx); + ltd_foreach_tgt_safe(ltd, tgt_desc, tmp) { LASSERT(tgt_desc); - if (!tgt_desc->ltd_reap) continue; list_add(&tgt_desc->ltd_kill, &kill); /*FIXME: only support ost pool for now */ - if (ltd == &lod->lod_ost_descs) { - lod_ost_pool_remove(&lod->lod_pool_info, idx); - if (tgt_desc->ltd_active) - lod->lod_desc.ld_active_tgt_count--; - } - lu_tgt_descs_del(ltd, tgt_desc); + if (ltd == &lod->lod_ost_descs) + lod_ost_pool_remove(&ltd->ltd_tgt_pool, + tgt_desc->ltd_index); + ltd_del_tgt(ltd, tgt_desc); ltd->ltd_death_row--; } mutex_unlock(&ltd->ltd_mutex); @@ -106,17 +101,8 @@ void lod_putref(struct lod_device *lod, struct lod_tgt_descs *ltd) list_for_each_entry_safe(tgt_desc, tmp, &kill, ltd_kill) { int rc; + list_del(&tgt_desc->ltd_kill); - if (ltd == &lod->lod_ost_descs) { - /* remove from QoS structures */ - rc = lqos_del_tgt(&lod->lod_qos, tgt_desc); - if (rc) - CERROR("%s: qos_del_tgt(%s) failed:" - "rc = %d\n", - lod2obd(lod)->obd_name, - obd_uuid2str(&tgt_desc->ltd_uuid), - rc); - } rc = obd_disconnect(tgt_desc->ltd_exp); if (rc) CERROR("%s: failed to disconnect %s: rc = %d\n", @@ -262,30 +248,25 @@ int lod_add_device(const struct lu_env *env, struct lod_device *lod, down_write(&ltd->ltd_rw_sem); mutex_lock(&ltd->ltd_mutex); - lu_tgt_descs_add(ltd, tgt_desc); + rc = ltd_add_tgt(ltd, tgt_desc); + if (rc) + GOTO(out_mutex, rc); + + rc = lu_qos_add_tgt(&ltd->ltd_qos, tgt_desc); + if (rc) + GOTO(out_del_tgt, rc); + if (for_ost) { - /* pool and qos are not supported for MDS stack yet */ - rc = lod_ost_pool_add(&lod->lod_pool_info, index, - lod->lod_osts_size); + /* pool is not
supported for MDS stack yet */ + rc = lod_ost_pool_add(&ltd->ltd_tgt_pool, index, + ltd->ltd_tgts_size); if (rc) { CERROR("%s: can't set up pool, failed with %d\n", obd->obd_name, rc); - GOTO(out_mutex, rc); - } - - rc = lqos_add_tgt(&lod->lod_qos, tgt_desc); - if (rc) { - CERROR("%s: qos_add_tgt failed with %d\n", - obd->obd_name, rc); - GOTO(out_pool, rc); + GOTO(out_del_tgt, rc); } - - /* The new OST is now a full citizen */ - if (index >= lod->lod_desc.ld_tgt_count) - lod->lod_desc.ld_tgt_count = index + 1; - if (active) - lod->lod_desc.ld_active_tgt_count++; } + mutex_unlock(&ltd->ltd_mutex); up_write(&ltd->ltd_rw_sem); @@ -320,10 +301,10 @@ out_ltd: thread = LTD_TGT(ltd, index)->ltd_recovery_thread; OBD_FREE_PTR(thread); } -out_pool: - lod_ost_pool_remove(&lod->lod_pool_info, index); + lod_ost_pool_remove(&ltd->ltd_tgt_pool, index); +out_del_tgt: + ltd_del_tgt(ltd, tgt_desc); out_mutex: - lu_tgt_descs_del(ltd, tgt_desc); mutex_unlock(&ltd->ltd_mutex); up_write(&ltd->ltd_rw_sem); OBD_FREE_PTR(tgt_desc); @@ -352,27 +333,19 @@ out_cleanup: * \param[in] env execution environment for this thread * \param[in] lod LOD device the target table belongs to * \param[in] ltd target table - * \param[in] idx index of the target - * \param[in] for_ost type of the target: 0 - MDT, 1 - OST + * \param[in] tgt target */ static void __lod_del_device(const struct lu_env *env, struct lod_device *lod, - struct lod_tgt_descs *ltd, unsigned idx, - bool for_ost) + struct lod_tgt_descs *ltd, struct lu_tgt_desc *tgt) { - LASSERT(LTD_TGT(ltd, idx)); - - lfsck_del_target(env, lod->lod_child, LTD_TGT(ltd, idx)->ltd_tgt, - idx, for_ost); + lfsck_del_target(env, lod->lod_child, tgt->ltd_tgt, tgt->ltd_index, + !ltd->ltd_is_mdt); - if (!for_ost && LTD_TGT(ltd, idx)->ltd_recovery_thread != NULL) { - struct ptlrpc_thread *thread; + if (ltd->ltd_is_mdt && tgt->ltd_recovery_thread) + OBD_FREE_PTR(tgt->ltd_recovery_thread); - thread = LTD_TGT(ltd, idx)->ltd_recovery_thread; - OBD_FREE_PTR(thread); - } - - if
(LTD_TGT(ltd, idx)->ltd_reap == 0) { - LTD_TGT(ltd, idx)->ltd_reap = 1; + if (!tgt->ltd_reap) { + tgt->ltd_reap = 1; ltd->ltd_death_row++; } } @@ -385,22 +358,21 @@ static void __lod_del_device(const struct lu_env *env, struct lod_device *lod, * \param[in] env execution environment for this thread * \param[in] lod LOD device the target table belongs to * \param[in] ltd target table - * \param[in] for_ost type of the target: MDT or OST * * \retval 0 always */ int lod_fini_tgt(const struct lu_env *env, struct lod_device *lod, - struct lod_tgt_descs *ltd, bool for_ost) + struct lod_tgt_descs *ltd) { - unsigned int idx; + struct lu_tgt_desc *tgt; if (ltd->ltd_tgts_size <= 0) return 0; lod_getref(ltd); mutex_lock(&ltd->ltd_mutex); - cfs_foreach_bit(ltd->ltd_tgt_bitmap, idx) - __lod_del_device(env, lod, ltd, idx, for_ost); + ltd_foreach_tgt(ltd, tgt) + __lod_del_device(env, lod, ltd, tgt); mutex_unlock(&ltd->ltd_mutex); lod_putref(lod, ltd); @@ -421,18 +393,19 @@ int lod_fini_tgt(const struct lu_env *env, struct lod_device *lod, * \param[in] osp name of OSP device to be removed * \param[in] idx index of the target * \param[in] gen generation number, not used currently - * \param[in] for_ost type of the target: 0 - MDT, 1 - OST * * \retval 0 if the device was scheduled for removal * \retval -EINVAL if no device was found */ int lod_del_device(const struct lu_env *env, struct lod_device *lod, - struct lod_tgt_descs *ltd, char *osp, unsigned idx, - unsigned gen, bool for_ost) + struct lod_tgt_descs *ltd, char *osp, unsigned int idx, + unsigned int gen) { struct obd_device *obd; - int rc = 0; - struct obd_uuid uuid; + struct lu_tgt_desc *tgt; + struct obd_uuid uuid; + int rc = 0; + ENTRY; CDEBUG(D_CONFIG, "osp:%s idx:%d gen:%d\n", osp, idx, gen); @@ -456,22 +429,21 @@ int lod_del_device(const struct lu_env *env, struct lod_device *lod, lod_getref(ltd); mutex_lock(&ltd->ltd_mutex); + tgt = LTD_TGT(ltd, idx); /* check that the index is allocated in the bitmap */ - if
(!cfs_bitmap_check(ltd->ltd_tgt_bitmap, idx) || - !LTD_TGT(ltd, idx)) { + if (!cfs_bitmap_check(ltd->ltd_tgt_bitmap, idx) || !tgt) { CERROR("%s: device %d is not set up\n", obd->obd_name, idx); GOTO(out, rc = -EINVAL); } /* check that the UUID matches */ - if (!obd_uuid_equals(&uuid, &LTD_TGT(ltd, idx)->ltd_uuid)) { + if (!obd_uuid_equals(&uuid, &tgt->ltd_uuid)) { CERROR("%s: LOD target UUID %s at index %d does not match %s\n", - obd->obd_name, obd_uuid2str(&LTD_TGT(ltd,idx)->ltd_uuid), - idx, osp); + obd->obd_name, obd_uuid2str(&tgt->ltd_uuid), idx, osp); GOTO(out, rc = -EINVAL); } - __lod_del_device(env, lod, ltd, idx, for_ost); + __lod_del_device(env, lod, ltd, tgt); EXIT; out: mutex_unlock(&ltd->ltd_mutex); @@ -1020,7 +992,7 @@ static int validate_lod_and_idx(struct lod_device *md, __u32 idx) return -EINVAL; } - if (unlikely(OST_TGT(md, idx)->ltd_ost == NULL)) { + if (unlikely(OST_TGT(md, idx)->ltd_tgt == NULL)) { CERROR("%s: invalid lod device, for idx: %d\n", lod2obd(md)->obd_name , idx); return -EINVAL; @@ -1098,7 +1070,7 @@ int lod_initialize_objects(const struct lu_env *env, struct lod_object *lo, GOTO(out, rc); } - nd = &OST_TGT(md,idx)->ltd_ost->dd_lu_dev; + nd = &OST_TGT(md, idx)->ltd_tgt->dd_lu_dev; lod_putref(md, &md->lod_ost_descs); /* In the function below, .hs_keycmp resolves to @@ -1571,9 +1543,9 @@ static int lod_verify_v1v3(struct lod_device *d, const struct lu_buf *buf, if (!is_from_disk && stripe_offset != LOV_OFFSET_DEFAULT && lov_pattern(le32_to_cpu(lum->lmm_pattern)) != LOV_PATTERN_MDT) { /* if offset is not within valid range [0, osts_size) */ - if (stripe_offset >= d->lod_osts_size) { + if (stripe_offset >= d->lod_ost_descs.ltd_tgts_size) { CDEBUG(D_LAYOUT, "stripe offset %u >= bitmap size %u\n", - stripe_offset, d->lod_osts_size); + stripe_offset, d->lod_ost_descs.ltd_tgts_size); GOTO(out, rc = -EINVAL); } @@ -1765,7 +1737,7 @@ int lod_fix_dom_stripe(struct lod_device *d, struct lov_comp_md_v1 *comp_v1, int lod_verify_striping(struct
lod_device *d, struct lod_object *lo, const struct lu_buf *buf, bool is_from_disk) { - struct lov_desc *desc = &d->lod_desc; + struct lov_desc *desc = &d->lod_ost_descs.ltd_lov_desc; struct lov_user_md_v1 *lum; struct lov_comp_md_v1 *comp_v1; struct lov_comp_md_entry_v1 *ent; @@ -2102,22 +2074,10 @@ int lod_pools_init(struct lod_device *lod, struct lustre_cfg *lcfg) lod_fix_desc(desc); desc->ld_active_tgt_count = 0; - lod->lod_desc = *desc; + lod->lod_ost_descs.ltd_lov_desc = *desc; lod->lod_sp_me = LUSTRE_SP_CLI; - /* Set up allocation policy (QoS and RR) */ - INIT_LIST_HEAD(&lod->lod_qos.lq_svr_list); - init_rwsem(&lod->lod_qos.lq_rw_sem); - lod->lod_qos.lq_dirty = 1; - lod->lod_qos.lq_reset = 1; - /* Default priority is toward free space balance */ - lod->lod_qos.lq_prio_free = 232; - /* Default threshold for rr (roughly 17%) */ - lod->lod_qos.lq_threshold_rr = 43; - - lu_qos_rr_init(&lod->lod_qos.lq_rr); - /* Set up OST pool environment */ lod->lod_pools_hash_body = cfs_hash_create("POOLS", HASH_POOLS_CUR_BITS, HASH_POOLS_MAX_BITS, @@ -2131,17 +2091,17 @@ int lod_pools_init(struct lod_device *lod, struct lustre_cfg *lcfg) INIT_LIST_HEAD(&lod->lod_pool_list); lod->lod_pool_count = 0; - rc = lod_ost_pool_init(&lod->lod_pool_info, 0); + rc = lod_ost_pool_init(&lod->lod_ost_descs.ltd_tgt_pool, 0); if (rc) GOTO(out_hash, rc); - rc = lod_ost_pool_init(&lod->lod_qos.lq_rr.lqr_pool, 0); + rc = lod_ost_pool_init(&lod->lod_ost_descs.ltd_qos.lq_rr.lqr_pool, 0); if (rc) GOTO(out_pool_info, rc); RETURN(0); out_pool_info: - lod_ost_pool_free(&lod->lod_pool_info); + lod_ost_pool_free(&lod->lod_ost_descs.ltd_tgt_pool); out_hash: cfs_hash_putref(lod->lod_pools_hash_body); @@ -2171,8 +2131,8 @@ int lod_pools_fini(struct lod_device *lod) } cfs_hash_putref(lod->lod_pools_hash_body); - lod_ost_pool_free(&(lod->lod_qos.lq_rr.lqr_pool)); - lod_ost_pool_free(&lod->lod_pool_info); + lod_ost_pool_free(&(lod->lod_ost_descs.ltd_qos.lq_rr.lqr_pool)); + 
lod_ost_pool_free(&lod->lod_ost_descs.ltd_tgt_pool); RETURN(0); } diff --git a/lustre/lod/lod_object.c b/lustre/lod/lod_object.c index f26198d..c7c2a3a 100644 --- a/lustre/lod/lod_object.c +++ b/lustre/lod/lod_object.c @@ -1571,7 +1571,7 @@ static int lod_xattr_get(const struct lu_env *env, struct dt_object *dt, if (is_root && strcmp(XATTR_NAME_LOV, name) == 0) { struct lov_user_md *lum = buf->lb_buf; - struct lov_desc *desc = &dev->lod_desc; + struct lov_desc *desc = &dev->lod_ost_descs.ltd_lov_desc; if (buf->lb_buf == NULL) { rc = sizeof(*lum); @@ -2685,7 +2685,7 @@ inline __u16 lod_comp_entry_stripe_count(struct lod_object *lo, else if (lod_comp_inited(entry)) return entry->llc_stripe_count; else if ((__u16)-1 == entry->llc_stripe_count) - return lod->lod_desc.ld_tgt_count; + return lod->lod_ost_count; else return lod_get_stripe_count(lod, lo, entry->llc_stripe_count, false); @@ -2756,14 +2756,14 @@ static int lod_declare_layout_add(const struct lu_env *env, { struct lod_thread_info *info = lod_env_info(env); struct lod_layout_component *comp_array, *lod_comp, *old_array; - struct lod_device *d = lu2lod_dev(dt->do_lu.lo_dev); + struct lod_device *d = lu2lod_dev(dt->do_lu.lo_dev); struct dt_object *next = dt_object_child(dt); - struct lov_desc *desc = &d->lod_desc; - struct lod_object *lo = lod_dt_obj(dt); - struct lov_user_md_v3 *v3; - struct lov_comp_md_v1 *comp_v1 = buf->lb_buf; - __u32 magic; - int i, rc, array_cnt, old_array_cnt; + struct lov_desc *desc = &d->lod_ost_descs.ltd_lov_desc; + struct lod_object *lo = lod_dt_obj(dt); + struct lov_user_md_v3 *v3; + struct lov_comp_md_v1 *comp_v1 = buf->lb_buf; + __u32 magic; + int i, rc, array_cnt, old_array_cnt; ENTRY; LASSERT(lo->ldo_is_composite); @@ -5065,10 +5065,11 @@ static void lod_striping_from_default(struct lod_object *lo, umode_t mode) { struct lod_device *d = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev); - struct lov_desc *desc = &d->lod_desc; int i, rc; if (lds->lds_def_striping_set && S_ISREG(mode)) { + 
struct lov_desc *desc = &d->lod_ost_descs.ltd_lov_desc; + rc = lod_alloc_comp_entries(lo, lds->lds_def_mirror_cnt, lds->lds_def_comp_cnt); if (rc != 0) @@ -5204,7 +5205,8 @@ static void lod_ah_init(const struct lu_env *env, LASSERT(child); if (ah->dah_append_stripes == -1) - ah->dah_append_stripes = d->lod_desc.ld_tgt_count; + ah->dah_append_stripes = + d->lod_ost_descs.ltd_lov_desc.ld_tgt_count; if (likely(parent)) { nextp = dt_object_child(parent); @@ -5385,7 +5387,7 @@ out: } LASSERT(!lc->ldo_is_composite); lod_comp = &lc->ldo_comp_entries[0]; - desc = &d->lod_desc; + desc = &d->lod_ost_descs.ltd_lov_desc; lod_adjust_stripe_info(lod_comp, desc, ah->dah_append_stripes); if (ah->dah_append_pool && ah->dah_append_pool[0]) lod_obj_set_pool(lc, 0, ah->dah_append_pool); @@ -5673,14 +5675,12 @@ static int lod_declare_create(const struct lu_env *env, struct dt_object *dt, } else if (lo->ldo_dir_stripe_offset != ss->ss_node_id) { struct lod_device *lod; - struct lod_tgt_descs *ltd; - struct lod_tgt_desc *tgt = NULL; + struct lu_tgt_desc *mdt = NULL; bool found_mdt = false; lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev); - ltd = &lod->lod_mdt_descs; - ltd_foreach_tgt(ltd, tgt) { - if (tgt->ltd_index == + lod_foreach_mdt(lod, mdt) { + if (mdt->ltd_index == lo->ldo_dir_stripe_offset) { found_mdt = true; break; @@ -6443,7 +6443,7 @@ static bool lod_sel_osts_allowed(const struct lu_env *env, break; } - rc = dt_statfs_info(env, ost->ltd_ost, sfs, &info); + rc = dt_statfs_info(env, ost->ltd_tgt, sfs, &info); if (rc) { CDEBUG(D_LAYOUT, "statfs failed for ost %d, error %d\n", index, rc); diff --git a/lustre/lod/lod_pool.c b/lustre/lod/lod_pool.c index c1426f3..eb23de4 100644 --- a/lustre/lod/lod_pool.c +++ b/lustre/lod/lod_pool.c @@ -464,7 +464,7 @@ void lod_dump_pool(int level, struct pool_desc *pool) * \retval negative error number on failure */ #define POOL_INIT_COUNT 2 -int lod_ost_pool_init(struct ost_pool *op, unsigned int count) +int lod_ost_pool_init(struct lu_tgt_pool *op, 
unsigned int count) { ENTRY; @@ -496,7 +496,7 @@ int lod_ost_pool_init(struct ost_pool *op, unsigned int count) * \retval 0 on success * \retval negative error number on failure. */ -int lod_ost_pool_extend(struct ost_pool *op, unsigned int min_count) +int lod_ost_pool_extend(struct lu_tgt_pool *op, unsigned int min_count) { __u32 *new; __u32 new_size; @@ -534,7 +534,7 @@ int lod_ost_pool_extend(struct ost_pool *op, unsigned int min_count) * \retval 0 if target could be added to the pool * \retval negative error if target \a idx was not added */ -int lod_ost_pool_add(struct ost_pool *op, __u32 idx, unsigned int min_count) +int lod_ost_pool_add(struct lu_tgt_pool *op, __u32 idx, unsigned int min_count) { unsigned int i; int rc = 0; @@ -574,7 +574,7 @@ out: * \retval 0 on success * \retval negative error number on failure */ -int lod_ost_pool_remove(struct ost_pool *op, __u32 idx) +int lod_ost_pool_remove(struct lu_tgt_pool *op, __u32 idx) { unsigned int i; ENTRY; @@ -608,7 +608,7 @@ int lod_ost_pool_remove(struct ost_pool *op, __u32 idx) * * \retval 0 on success or if pool was already freed */ -int lod_ost_pool_free(struct ost_pool *op) +int lod_ost_pool_free(struct lu_tgt_pool *op) { ENTRY; @@ -766,11 +766,11 @@ int lod_pool_del(struct obd_device *obd, char *poolname) */ int lod_pool_add(struct obd_device *obd, char *poolname, char *ostname) { - struct lod_device *lod = lu2lod_dev(obd->obd_lu_dev); - struct obd_uuid ost_uuid; - struct pool_desc *pool; - unsigned int idx; - int rc = -EINVAL; + struct lod_device *lod = lu2lod_dev(obd->obd_lu_dev); + struct obd_uuid ost_uuid; + struct pool_desc *pool; + struct lu_tgt_desc *tgt; + int rc = -EINVAL; ENTRY; pool = cfs_hash_lookup(lod->lod_pools_hash_body, poolname); @@ -781,8 +781,8 @@ int lod_pool_add(struct obd_device *obd, char *poolname, char *ostname) /* search ost in lod array */ lod_getref(&lod->lod_ost_descs); - lod_foreach_ost(lod, idx) { - if (obd_uuid_equals(&ost_uuid, &OST_TGT(lod, idx)->ltd_uuid)) { + 
lod_foreach_ost(lod, tgt) { + if (obd_uuid_equals(&ost_uuid, &tgt->ltd_uuid)) { rc = 0; break; } @@ -791,7 +791,8 @@ int lod_pool_add(struct obd_device *obd, char *poolname, char *ostname) if (rc) GOTO(out, rc); - rc = lod_ost_pool_add(&pool->pool_obds, idx, lod->lod_osts_size); + rc = lod_ost_pool_add(&pool->pool_obds, tgt->ltd_index, + lod->lod_ost_descs.ltd_tgts_size); if (rc) GOTO(out, rc); @@ -823,11 +824,11 @@ out: */ int lod_pool_remove(struct obd_device *obd, char *poolname, char *ostname) { - struct lod_device *lod = lu2lod_dev(obd->obd_lu_dev); - struct obd_uuid ost_uuid; - struct pool_desc *pool; - unsigned int idx; - int rc = -EINVAL; + struct lod_device *lod = lu2lod_dev(obd->obd_lu_dev); + struct lu_tgt_desc *ost; + struct obd_uuid ost_uuid; + struct pool_desc *pool; + int rc = -EINVAL; ENTRY; pool = cfs_hash_lookup(lod->lod_pools_hash_body, poolname); @@ -837,8 +838,8 @@ int lod_pool_remove(struct obd_device *obd, char *poolname, char *ostname) obd_str2uuid(&ost_uuid, ostname); lod_getref(&lod->lod_ost_descs); - cfs_foreach_bit(lod->lod_ost_bitmap, idx) { - if (obd_uuid_equals(&ost_uuid, &OST_TGT(lod, idx)->ltd_uuid)) { + lod_foreach_ost(lod, ost) { + if (obd_uuid_equals(&ost_uuid, &ost->ltd_uuid)) { rc = 0; break; } @@ -848,7 +849,7 @@ int lod_pool_remove(struct obd_device *obd, char *poolname, char *ostname) if (rc) GOTO(out, rc); - lod_ost_pool_remove(&pool->pool_obds, idx); + lod_ost_pool_remove(&pool->pool_obds, ost->ltd_index); pool->pool_rr.lqr_dirty = 1; diff --git a/lustre/lod/lod_qos.c b/lustre/lod/lod_qos.c index 704d8bb..1892dee 100644 --- a/lustre/lod/lod_qos.c +++ b/lustre/lod/lod_qos.c @@ -71,7 +71,8 @@ * * \param[in] env execution environment for this thread * \param[in] d LOD device - * \param[in] index index of OST target to check + * \param[in] ltd target table + * \param[in] index target index * \param[out] sfs buffer for statfs data * * \retval 0 if the target is good @@ -79,17 +80,19 @@ */ static int lod_statfs_and_check(const 
struct lu_env *env, struct lod_device *d, - int index, struct obd_statfs *sfs) + struct lu_tgt_descs *ltd, int index, + struct obd_statfs *sfs) { - struct lod_tgt_desc *ost; - int rc; + struct lov_desc *desc = &ltd->ltd_lov_desc; + struct lu_tgt_desc *tgt = LTD_TGT(ltd, index); + int rc; + ENTRY; LASSERT(d); - ost = OST_TGT(d,index); - LASSERT(ost); + LASSERT(tgt); - rc = dt_statfs(env, ost->ltd_ost, sfs); + rc = dt_statfs(env, tgt->ltd_tgt, sfs); if (rc == 0 && ((sfs->os_state & OS_STATE_ENOSPC) || (sfs->os_state & OS_STATE_ENOINO && sfs->os_fprecreated == 0))) @@ -107,36 +110,36 @@ static int lod_statfs_and_check(const struct lu_env *env, struct lod_device *d, rc = -ENOBUFS; /* check whether device has changed state (active, inactive) */ - if (rc != 0 && ost->ltd_active) { + if (rc != 0 && tgt->ltd_active) { /* turned inactive? */ spin_lock(&d->lod_lock); - if (ost->ltd_active) { - ost->ltd_active = 0; + if (tgt->ltd_active) { + tgt->ltd_active = 0; if (rc == -ENOTCONN) - ost->ltd_connecting = 1; + tgt->ltd_connecting = 1; - LASSERT(d->lod_desc.ld_active_tgt_count > 0); - d->lod_desc.ld_active_tgt_count--; - d->lod_qos.lq_dirty = 1; - d->lod_qos.lq_rr.lqr_dirty = 1; + LASSERT(desc->ld_active_tgt_count > 0); + desc->ld_active_tgt_count--; + ltd->ltd_qos.lq_dirty = 1; + ltd->ltd_qos.lq_rr.lqr_dirty = 1; CDEBUG(D_CONFIG, "%s: turns inactive\n", - ost->ltd_exp->exp_obd->obd_name); + tgt->ltd_exp->exp_obd->obd_name); } spin_unlock(&d->lod_lock); - } else if (rc == 0 && ost->ltd_active == 0) { + } else if (rc == 0 && tgt->ltd_active == 0) { /* turned active?
*/ - LASSERTF(d->lod_desc.ld_active_tgt_count < d->lod_ostnr, - "active tgt count %d, ost nr %d\n", - d->lod_desc.ld_active_tgt_count, d->lod_ostnr); + LASSERTF(desc->ld_active_tgt_count < desc->ld_tgt_count, + "active tgt count %d, tgt nr %d\n", + desc->ld_active_tgt_count, desc->ld_tgt_count); spin_lock(&d->lod_lock); - if (ost->ltd_active == 0) { - ost->ltd_active = 1; - ost->ltd_connecting = 0; - d->lod_desc.ld_active_tgt_count++; - d->lod_qos.lq_dirty = 1; - d->lod_qos.lq_rr.lqr_dirty = 1; + if (tgt->ltd_active == 0) { + tgt->ltd_active = 1; + tgt->ltd_connecting = 0; + desc->ld_active_tgt_count++; + ltd->ltd_qos.lq_dirty = 1; + ltd->ltd_qos.lq_rr.lqr_dirty = 1; CDEBUG(D_CONFIG, "%s: turns active\n", - ost->ltd_exp->exp_obd->obd_name); + tgt->ltd_exp->exp_obd->obd_name); } spin_unlock(&d->lod_lock); } @@ -157,20 +160,21 @@ static int lod_statfs_and_check(const struct lu_env *env, struct lod_device *d, void lod_qos_statfs_update(const struct lu_env *env, struct lod_device *lod) { struct obd_device *obd = lod2obd(lod); - struct ost_pool *osts = &(lod->lod_pool_info); + struct lu_tgt_pool *osts = &lod->lod_ost_descs.ltd_tgt_pool; time64_t max_age; unsigned int i; u64 avail; int idx; ENTRY; - max_age = ktime_get_seconds() - 2 * lod->lod_desc.ld_qos_maxage; + max_age = ktime_get_seconds() - + 2 * lod->lod_ost_descs.ltd_lov_desc.ld_qos_maxage; if (obd->obd_osfs_age > max_age) /* statfs data are quite recent, don't need to refresh it */ RETURN_EXIT; - down_write(&lod->lod_qos.lq_rw_sem); + down_write(&lod->lod_ost_descs.ltd_qos.lq_rw_sem); if (obd->obd_osfs_age > max_age) goto out; @@ -178,17 +182,17 @@ void lod_qos_statfs_update(const struct lu_env *env, struct lod_device *lod) for (i = 0; i < osts->op_count; i++) { idx = osts->op_array[i]; avail = OST_TGT(lod,idx)->ltd_statfs.os_bavail; - if (lod_statfs_and_check(env, lod, idx, + if (lod_statfs_and_check(env, lod, &lod->lod_ost_descs, idx, &OST_TGT(lod, idx)->ltd_statfs)) continue; if 
(OST_TGT(lod,idx)->ltd_statfs.os_bavail != avail) /* recalculate weigths */ - lod->lod_qos.lq_dirty = 1; + lod->lod_ost_descs.ltd_qos.lq_dirty = 1; } obd->obd_osfs_age = ktime_get_seconds(); out: - up_write(&lod->lod_qos.lq_rw_sem); + up_write(&lod->lod_ost_descs.ltd_qos.lq_rw_sem); EXIT; } @@ -210,7 +214,7 @@ out: * \retval 0 on success * \retval -ENOMEM fails to allocate the array */ -static int lod_qos_calc_rr(struct lod_device *lod, struct ost_pool *src_pool, +static int lod_qos_calc_rr(struct lod_device *lod, struct lu_tgt_pool *src_pool, struct lu_qos_rr *lqr) { struct lu_svr_qos *oss; @@ -226,7 +230,7 @@ static int lod_qos_calc_rr(struct lod_device *lod, struct ost_pool *src_pool, } /* Do actual allocation. */ - down_write(&lod->lod_qos.lq_rw_sem); + down_write(&lod->lod_ost_descs.ltd_qos.lq_rw_sem); /* * Check again. While we were sleeping on @lq_rw_sem something could @@ -234,7 +238,7 @@ static int lod_qos_calc_rr(struct lod_device *lod, struct ost_pool *src_pool, */ if (!lqr->lqr_dirty) { LASSERT(lqr->lqr_pool.op_size); - up_write(&lod->lod_qos.lq_rw_sem); + up_write(&lod->lod_ost_descs.ltd_qos.lq_rw_sem); RETURN(0); } @@ -247,7 +251,7 @@ static int lod_qos_calc_rr(struct lod_device *lod, struct ost_pool *src_pool, lqr->lqr_pool.op_count = real_count; rc = lod_ost_pool_extend(&lqr->lqr_pool, real_count); if (rc) { - up_write(&lod->lod_qos.lq_rw_sem); + up_write(&lod->lod_ost_descs.ltd_qos.lq_rw_sem); RETURN(rc); } for (i = 0; i < lqr->lqr_pool.op_count; i++) @@ -255,7 +259,8 @@ static int lod_qos_calc_rr(struct lod_device *lod, struct ost_pool *src_pool, /* Place all the OSTs from 1 OSS at the same time. 
*/ placed = 0; - list_for_each_entry(oss, &lod->lod_qos.lq_svr_list, lsq_svr_list) { + list_for_each_entry(oss, &lod->lod_ost_descs.ltd_qos.lq_svr_list, + lsq_svr_list) { int j = 0; for (i = 0; i < lqr->lqr_pool.op_count; i++) { @@ -266,7 +271,7 @@ static int lod_qos_calc_rr(struct lod_device *lod, struct ost_pool *src_pool, continue; ost = OST_TGT(lod,src_pool->op_array[i]); - LASSERT(ost && ost->ltd_ost); + LASSERT(ost && ost->ltd_tgt); if (ost->ltd_qos.ltq_svr != oss) continue; @@ -282,7 +287,7 @@ static int lod_qos_calc_rr(struct lod_device *lod, struct ost_pool *src_pool, } lqr->lqr_dirty = 0; - up_write(&lod->lod_qos.lq_rw_sem); + up_write(&lod->lod_ost_descs.ltd_qos.lq_rw_sem); if (placed != real_count) { /* This should never happen */ @@ -338,12 +343,12 @@ static struct dt_object *lod_qos_declare_object_on(const struct lu_env *env, ENTRY; LASSERT(d); - LASSERT(ost_idx < d->lod_osts_size); + LASSERT(ost_idx < d->lod_ost_descs.ltd_tgts_size); ost = OST_TGT(d,ost_idx); LASSERT(ost); - LASSERT(ost->ltd_ost); + LASSERT(ost->ltd_tgt); - nd = &ost->ltd_ost->dd_lu_dev; + nd = &ost->ltd_tgt->dd_lu_dev; /* * allocate anonymous object with zero fid, real fid @@ -589,7 +594,7 @@ static int lod_check_and_reserve_ost(const struct lu_env *env, int rc; ENTRY; - rc = lod_statfs_and_check(env, lod, ost_idx, sfs); + rc = lod_statfs_and_check(env, lod, &lod->lod_ost_descs, ost_idx, sfs); if (rc) RETURN(rc); @@ -694,9 +699,9 @@ static int lod_alloc_rr(const struct lu_env *env, struct lod_object *lo, struct lod_device *m = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev); struct obd_statfs *sfs = &lod_env_info(env)->lti_osfs; struct pool_desc *pool = NULL; - struct ost_pool *osts; + struct lu_tgt_pool *osts; struct lu_qos_rr *lqr; - unsigned int i, array_idx; + unsigned int i, array_idx; __u32 ost_start_idx_temp; __u32 stripe_idx = 0; __u32 stripe_count, stripe_count_min, ost_idx; @@ -718,8 +723,8 @@ static int lod_alloc_rr(const struct lu_env *env, struct lod_object *lo, osts = 
&(pool->pool_obds); lqr = &(pool->pool_rr); } else { - osts = &(m->lod_pool_info); - lqr = &(m->lod_qos.lq_rr); + osts = &m->lod_ost_descs.ltd_tgt_pool; + lqr = &(m->lod_ost_descs.ltd_qos.lq_rr); } rc = lod_qos_calc_rr(m, osts, lqr); @@ -730,7 +735,7 @@ static int lod_alloc_rr(const struct lu_env *env, struct lod_object *lo, if (rc) GOTO(out, rc); - down_read(&m->lod_qos.lq_rw_sem); + down_read(&m->lod_ost_descs.ltd_qos.lq_rw_sem); spin_lock(&lqr->lqr_alloc); if (--lqr->lqr_start_count <= 0) { lqr->lqr_start_idx = prandom_u32_max(osts->op_count); @@ -799,7 +804,7 @@ repeat_find: } spin_unlock(&lqr->lqr_alloc); - up_read(&m->lod_qos.lq_rw_sem); + up_read(&m->lod_ost_descs.ltd_qos.lq_rw_sem); /* If there are enough OSTs, a component with overstriping requested * will not actually end up overstriped. The comp should reflect this. @@ -914,7 +919,8 @@ static int lod_alloc_ost_list(const struct lu_env *env, struct lod_object *lo, break; } - rc = lod_statfs_and_check(env, m, ost_idx, sfs); + rc = lod_statfs_and_check(env, m, &m->lod_ost_descs, ost_idx, + sfs); if (rc < 0) /* this OSP doesn't feel well */ break; @@ -977,8 +983,8 @@ static int lod_alloc_specific(const struct lu_env *env, struct lod_object *lo, unsigned int i, array_idx, ost_count; int rc, stripe_num = 0; int speed = 0; - struct pool_desc *pool = NULL; - struct ost_pool *osts; + struct pool_desc *pool = NULL; + struct lu_tgt_pool *osts; int stripes_per_ost = 1; bool overstriped = false; ENTRY; @@ -997,7 +1003,7 @@ static int lod_alloc_specific(const struct lu_env *env, struct lod_object *lo, down_read(&pool_tgt_rw_sem(pool)); osts = &(pool->pool_obds); } else { - osts = &(m->lod_pool_info); + osts = &m->lod_ost_descs.ltd_tgt_pool; } ost_count = osts->op_count; @@ -1058,7 +1064,8 @@ repeat_find: * start OST, then it can be skipped, otherwise skip it only * if it is inactive/recovering/out-of-space." 
*/ - rc = lod_statfs_and_check(env, m, ost_idx, sfs); + rc = lod_statfs_and_check(env, m, &m->lod_ost_descs, ost_idx, + sfs); if (rc) { /* this OSP doesn't feel well */ continue; @@ -1169,7 +1176,7 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo, struct dt_object *o; __u64 total_weight = 0; struct pool_desc *pool = NULL; - struct ost_pool *osts; + struct lu_tgt_pool *osts; unsigned int i; __u32 nfound, good_osts, stripe_count, stripe_count_min; bool overstriped = false; @@ -1191,11 +1198,11 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo, down_read(&pool_tgt_rw_sem(pool)); osts = &(pool->pool_obds); } else { - osts = &(lod->lod_pool_info); + osts = &lod->lod_ost_descs.ltd_tgt_pool; } /* Detect -EAGAIN early, before expensive lock is taken. */ - if (!lqos_is_usable(&lod->lod_qos, lod->lod_desc.ld_active_tgt_count)) + if (!ltd_qos_is_usable(&lod->lod_ost_descs)) GOTO(out_nolock, rc = -EAGAIN); if (lod_comp->llc_pattern & LOV_PATTERN_OVERSTRIPING) @@ -1203,18 +1210,16 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo, (lod_comp->llc_stripe_count - 1)/osts->op_count + 1; /* Do actual allocation, use write lock here. */ - down_write(&lod->lod_qos.lq_rw_sem); + down_write(&lod->lod_ost_descs.ltd_qos.lq_rw_sem); /* * Check again, while we were sleeping on @lq_rw_sem things could * change. 
*/ - if (!lqos_is_usable(&lod->lod_qos, lod->lod_desc.ld_active_tgt_count)) + if (!ltd_qos_is_usable(&lod->lod_ost_descs)) GOTO(out, rc = -EAGAIN); - rc = lqos_calc_penalties(&lod->lod_qos, &lod->lod_ost_descs, - lod->lod_desc.ld_active_tgt_count, - lod->lod_desc.ld_qos_maxage, false); + rc = ltd_qos_penalties_calc(&lod->lod_ost_descs); if (rc) GOTO(out, rc); @@ -1231,7 +1236,8 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo, ost = OST_TGT(lod, osts->op_array[i]); ost->ltd_qos.ltq_usable = 0; - rc = lod_statfs_and_check(env, lod, osts->op_array[i], sfs); + rc = lod_statfs_and_check(env, lod, &lod->lod_ost_descs, + osts->op_array[i], sfs); if (rc) { /* this OSP doesn't feel well */ continue; @@ -1247,7 +1253,7 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo, continue; ost->ltd_qos.ltq_usable = 1; - lqos_calc_weight(ost); + lu_tgt_qos_weight_calc(ost); total_weight += ost->ltd_qos.ltq_weight; good_osts++; @@ -1324,10 +1330,7 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo, lod_qos_ost_in_use(env, nfound, idx); stripe[nfound] = o; ost_indices[nfound] = idx; - lqos_recalc_weight(&lod->lod_qos, &lod->lod_ost_descs, - ost, - lod->lod_desc.ld_active_tgt_count, - &total_weight); + ltd_qos_update(&lod->lod_ost_descs, ost, &total_weight); nfound++; rc = 0; break; @@ -1356,8 +1359,8 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo, } /* makes sense to rebalance next time */ - lod->lod_qos.lq_dirty = 1; - lod->lod_qos.lq_same_space = 0; + lod->lod_ost_descs.ltd_qos.lq_dirty = 1; + lod->lod_ost_descs.ltd_qos.lq_same_space = 0; rc = -EAGAIN; } @@ -1369,7 +1372,7 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo, lod_comp->llc_pattern &= ~LOV_PATTERN_OVERSTRIPING; out: - up_write(&lod->lod_qos.lq_rw_sem); + up_write(&lod->lod_ost_descs.ltd_qos.lq_rw_sem); out_nolock: if (pool != NULL) { @@ -1408,12 +1411,16 @@ __u16 lod_get_stripe_count(struct 
lod_device *lod, struct lod_object *lo, if (!stripe_count) - stripe_count = lod->lod_desc.ld_default_stripe_count; + stripe_count = + lod->lod_ost_descs.ltd_lov_desc.ld_default_stripe_count; if (!stripe_count) stripe_count = 1; /* Overstriping allows more stripes than targets */ - if (stripe_count > lod->lod_desc.ld_active_tgt_count && !overstriping) - stripe_count = lod->lod_desc.ld_active_tgt_count; + if (stripe_count > + lod->lod_ost_descs.ltd_lov_desc.ld_active_tgt_count && + !overstriping) + stripe_count = + lod->lod_ost_descs.ltd_lov_desc.ld_active_tgt_count; if (lo->ldo_is_composite) { struct lod_layout_component *lod_comp; @@ -1629,7 +1636,7 @@ int lod_qos_parse_config(const struct lu_env *env, struct lod_object *lo, { struct lod_layout_component *lod_comp; struct lod_device *d = lu2lod_dev(lod2lu_obj(lo)->lo_dev); - struct lov_desc *desc = &d->lod_desc; + struct lov_desc *desc = &d->lod_ost_descs.ltd_lov_desc; struct lov_user_md_v1 *v1 = NULL; struct lov_user_md_v3 *v3 = NULL; struct lov_comp_md_v1 *comp_v1 = NULL; @@ -1845,16 +1852,16 @@ free_comp: int lod_prepare_avoidance(const struct lu_env *env, struct lod_object *lo) { struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev); - struct lod_tgt_descs *ltds = &lod->lod_ost_descs; struct lod_avoid_guide *lag = &lod_env_info(env)->lti_avoid; struct cfs_bitmap *bitmap = NULL; __u32 *new_oss = NULL; - lag->lag_ost_avail = ltds->ltd_tgtnr; + lag->lag_ost_avail = lod->lod_ost_count; /* reset OSS avoid guide array */ lag->lag_oaa_count = 0; - if (lag->lag_oss_avoid_array && lag->lag_oaa_size < ltds->ltd_tgtnr) { + if (lag->lag_oss_avoid_array && + lag->lag_oaa_size < lod->lod_ost_count) { OBD_FREE(lag->lag_oss_avoid_array, sizeof(__u32) * lag->lag_oaa_size); lag->lag_oss_avoid_array = NULL; @@ -1863,7 +1870,7 @@ int lod_prepare_avoidance(const struct lu_env *env, struct lod_object *lo) /* init OST avoid guide bitmap */ if (lag->lag_ost_avoid_bitmap) { - if (ltds->ltd_tgtnr <= 
lag->lag_ost_avoid_bitmap->size) { + if (lod->lod_ost_count <= lag->lag_ost_avoid_bitmap->size) { CFS_RESET_BITMAP(lag->lag_ost_avoid_bitmap); } else { CFS_FREE_BITMAP(lag->lag_ost_avoid_bitmap); @@ -1872,7 +1879,7 @@ int lod_prepare_avoidance(const struct lu_env *env, struct lod_object *lo) } if (!lag->lag_ost_avoid_bitmap) { - bitmap = CFS_ALLOCATE_BITMAP(ltds->ltd_tgtnr); + bitmap = CFS_ALLOCATE_BITMAP(lod->lod_ost_count); if (!bitmap) return -ENOMEM; } @@ -1884,7 +1891,7 @@ int lod_prepare_avoidance(const struct lu_env *env, struct lod_object *lo) * using OST count to allocate the array to store the OSS * id. */ - OBD_ALLOC(new_oss, sizeof(*new_oss) * ltds->ltd_tgtnr); + OBD_ALLOC(new_oss, sizeof(*new_oss) * lod->lod_ost_count); if (!new_oss) { CFS_FREE_BITMAP(bitmap); return -ENOMEM; @@ -1893,7 +1900,7 @@ int lod_prepare_avoidance(const struct lu_env *env, struct lod_object *lo) if (new_oss) { lag->lag_oss_avoid_array = new_oss; - lag->lag_oaa_size = ltds->ltd_tgtnr; + lag->lag_oaa_size = lod->lod_ost_count; } if (bitmap) lag->lag_ost_avoid_bitmap = bitmap; @@ -2053,7 +2060,7 @@ int lod_qos_prep_create(const struct lu_env *env, struct lod_object *lo, lod_getref(&d->lod_ost_descs); /* XXX: support for non-0 files w/o objects */ CDEBUG(D_OTHER, "tgt_count %d stripe_count %d\n", - d->lod_desc.ld_tgt_count, stripe_len); + d->lod_ost_count, stripe_len); if (lod_comp->llc_ostlist.op_array && lod_comp->llc_ostlist.op_count) { @@ -2147,7 +2154,7 @@ int lod_prepare_create(const struct lu_env *env, struct lod_object *lo, /* no OST available */ /* XXX: should we be waiting a bit to prevent failures during * cluster initialization? 
*/ - if (d->lod_ostnr == 0) + if (!d->lod_ost_count) RETURN(-EIO); /* diff --git a/lustre/lod/lproc_lod.c b/lustre/lod/lproc_lod.c index cb2c780..cccc6aa 100644 --- a/lustre/lod/lproc_lod.c +++ b/lustre/lod/lproc_lod.c @@ -136,7 +136,7 @@ static int lod_stripesize_seq_show(struct seq_file *m, void *v) LASSERT(dev != NULL); lod = lu2lod_dev(dev->obd_lu_dev); seq_printf(m, "%llu\n", - lod->lod_desc.ld_default_stripe_size); + lod->lod_ost_descs.ltd_lov_desc.ld_default_stripe_size); return 0; } @@ -172,7 +172,7 @@ lod_stripesize_seq_write(struct file *file, const char __user *buffer, return -ERANGE; lod_fix_desc_stripe_size(&val); - lod->lod_desc.ld_default_stripe_size = val; + lod->lod_ost_descs.ltd_lov_desc.ld_default_stripe_size = val; return count; } @@ -194,7 +194,8 @@ static ssize_t stripeoffset_show(struct kobject *kobj, struct attribute *attr, dd_kobj); struct lod_device *lod = dt2lod_dev(dt); - return sprintf(buf, "%lld\n", lod->lod_desc.ld_default_stripe_offset); + return sprintf(buf, "%lld\n", + lod->lod_ost_descs.ltd_lov_desc.ld_default_stripe_offset); } /** @@ -227,7 +228,7 @@ static ssize_t stripeoffset_store(struct kobject *kobj, struct attribute *attr, if (val < -1 || val > LOV_MAX_STRIPE_COUNT) return -ERANGE; - lod->lod_desc.ld_default_stripe_offset = val; + lod->lod_ost_descs.ltd_lov_desc.ld_default_stripe_offset = val; return count; } @@ -249,7 +250,7 @@ static ssize_t stripetype_show(struct kobject *kobj, struct attribute *attr, dd_kobj); struct lod_device *lod = dt2lod_dev(dt); - return sprintf(buf, "%u\n", lod->lod_desc.ld_pattern); + return sprintf(buf, "%u\n", lod->lod_ost_descs.ltd_lov_desc.ld_pattern); } /** @@ -278,7 +279,7 @@ static ssize_t stripetype_store(struct kobject *kobj, struct attribute *attr, return rc; lod_fix_desc_pattern(&pattern); - lod->lod_desc.ld_pattern = pattern; + lod->lod_ost_descs.ltd_lov_desc.ld_pattern = pattern; return count; } @@ -299,9 +300,10 @@ static ssize_t stripecount_show(struct kobject *kobj, struct 
attribute *attr, struct dt_device *dt = container_of(kobj, struct dt_device, dd_kobj); struct lod_device *lod = dt2lod_dev(dt); + struct lov_desc *desc = &lod->lod_ost_descs.ltd_lov_desc; return sprintf(buf, "%d\n", - (s16)(lod->lod_desc.ld_default_stripe_count + 1) - 1); + (s16)(desc->ld_default_stripe_count + 1) - 1); } /** @@ -333,7 +335,7 @@ static ssize_t stripecount_store(struct kobject *kobj, struct attribute *attr, return -ERANGE; lod_fix_desc_stripe_count(&stripe_count); - lod->lod_desc.ld_default_stripe_count = stripe_count; + lod->lod_ost_descs.ltd_lov_desc.ld_default_stripe_count = stripe_count; return count; } @@ -355,7 +357,7 @@ static ssize_t numobd_show(struct kobject *kobj, struct attribute *attr, dd_kobj); struct lod_device *lod = dt2lod_dev(dt); - return sprintf(buf, "%u\n", lod->lod_desc.ld_tgt_count); + return sprintf(buf, "%u\n", lod->lod_ost_count); } LUSTRE_RO_ATTR(numobd); @@ -375,7 +377,8 @@ static ssize_t activeobd_show(struct kobject *kobj, struct attribute *attr, dd_kobj); struct lod_device *lod = dt2lod_dev(dt); - return sprintf(buf, "%u\n", lod->lod_desc.ld_active_tgt_count); + return sprintf(buf, "%u\n", + lod->lod_ost_descs.ltd_lov_desc.ld_active_tgt_count); } LUSTRE_RO_ATTR(activeobd); @@ -395,7 +398,8 @@ static ssize_t desc_uuid_show(struct kobject *kobj, struct attribute *attr, dd_kobj); struct lod_device *lod = dt2lod_dev(dt); - return sprintf(buf, "%s\n", lod->lod_desc.ld_uuid.uuid); + return sprintf(buf, "%s\n", + lod->lod_ost_descs.ltd_lov_desc.ld_uuid.uuid); } LUSTRE_RO_ATTR(desc_uuid); @@ -421,7 +425,8 @@ static ssize_t qos_prio_free_show(struct kobject *kobj, struct attribute *attr, struct lod_device *lod = dt2lod_dev(dt); return sprintf(buf, "%d%%\n", - (lod->lod_qos.lq_prio_free * 100 + 255) >> 8); + (lod->lod_ost_descs.ltd_qos.lq_prio_free * 100 + 255) >> + 8); } /** @@ -455,9 +460,9 @@ static ssize_t qos_prio_free_store(struct kobject *kobj, struct attribute *attr, if (val > 100) return -EINVAL; - 
lod->lod_qos.lq_prio_free = (val << 8) / 100; - lod->lod_qos.lq_dirty = 1; - lod->lod_qos.lq_reset = 1; + lod->lod_ost_descs.ltd_qos.lq_prio_free = (val << 8) / 100; + lod->lod_ost_descs.ltd_qos.lq_dirty = 1; + lod->lod_ost_descs.ltd_qos.lq_reset = 1; return count; } @@ -480,7 +485,8 @@ static int lod_qos_thresholdrr_seq_show(struct seq_file *m, void *v) LASSERT(dev != NULL); lod = lu2lod_dev(dev->obd_lu_dev); seq_printf(m, "%d%%\n", - (lod->lod_qos.lq_threshold_rr * 100 + 255) >> 8); + (lod->lod_ost_descs.ltd_qos.lq_threshold_rr * 100 + 255) >> + 8); return 0; } @@ -521,8 +527,8 @@ lod_qos_thresholdrr_seq_write(struct file *file, const char __user *buffer, if (val > 100 || val < 0) return -EINVAL; - lod->lod_qos.lq_threshold_rr = (val << 8) / 100; - lod->lod_qos.lq_dirty = 1; + lod->lod_ost_descs.ltd_qos.lq_threshold_rr = (val << 8) / 100; + lod->lod_ost_descs.ltd_qos.lq_dirty = 1; return count; } @@ -545,7 +551,8 @@ static ssize_t qos_maxage_show(struct kobject *kobj, struct attribute *attr, dd_kobj); struct lod_device *lod = dt2lod_dev(dt); - return sprintf(buf, "%u Sec\n", lod->lod_desc.ld_qos_maxage); + return sprintf(buf, "%u Sec\n", + lod->lod_ost_descs.ltd_lov_desc.ld_qos_maxage); } /** @@ -569,7 +576,7 @@ static ssize_t qos_maxage_store(struct kobject *kobj, struct attribute *attr, struct lu_device *next; struct lustre_cfg *lcfg; char str[32]; - unsigned int i; + struct lu_tgt_desc *tgt; int rc; u32 val; @@ -579,7 +586,7 @@ static ssize_t qos_maxage_store(struct kobject *kobj, struct attribute *attr, if (val <= 0) return -EINVAL; - lod->lod_desc.ld_qos_maxage = val; + lod->lod_ost_descs.ltd_lov_desc.ld_qos_maxage = val; /* * propogate the value down to OSPs @@ -593,11 +600,12 @@ static ssize_t qos_maxage_store(struct kobject *kobj, struct attribute *attr, lustre_cfg_init(lcfg, LCFG_PARAM, &bufs); lod_getref(&lod->lod_ost_descs); - lod_foreach_ost(lod, i) { - next = &OST_TGT(lod,i)->ltd_ost->dd_lu_dev; + lod_foreach_ost(lod, tgt) { + next = 
&tgt->ltd_tgt->dd_lu_dev; rc = next->ld_ops->ldo_process_config(NULL, next, lcfg); if (rc) - CERROR("can't set maxage on #%d: %d\n", i, rc); + CERROR("can't set maxage on #%d: %d\n", + tgt->ltd_index, rc); } lod_putref(lod, &lod->lod_ost_descs); OBD_FREE(lcfg, lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens)); @@ -664,7 +672,7 @@ static void *lod_osts_seq_next(struct seq_file *p, void *v, loff_t *pos) static int lod_osts_seq_show(struct seq_file *p, void *v) { struct obd_device *obd = p->private; - struct lod_ost_desc *ost_desc = v; + struct lu_tgt_desc *ost_desc = v; struct lod_device *lod; int idx, rc, active; struct dt_device *next; @@ -674,7 +682,7 @@ static int lod_osts_seq_show(struct seq_file *p, void *v) lod = lu2lod_dev(obd->obd_lu_dev); idx = ost_desc->ltd_index; - next = OST_TGT(lod,idx)->ltd_ost; + next = OST_TGT(lod, idx)->ltd_tgt; if (next == NULL) return -EINVAL; diff --git a/lustre/lov/lov_internal.h b/lustre/lov/lov_internal.h index 33aa806..2307d10 100644 --- a/lustre/lov/lov_internal.h +++ b/lustre/lov/lov_internal.h @@ -234,7 +234,7 @@ void lsm_free(struct lov_stripe_md *lsm); struct pool_desc { char pool_name[LOV_MAXPOOLNAME + 1]; - struct ost_pool pool_obds; + struct lu_tgt_pool pool_obds; atomic_t pool_refcount; struct hlist_node pool_hash; /* access by poolname */ struct list_head pool_list; /* serial access */ @@ -333,12 +333,12 @@ extern struct lu_device_type lov_device_type; /* pools */ extern struct cfs_hash_ops pool_hash_operations; -/* ost_pool methods */ -int lov_ost_pool_init(struct ost_pool *op, unsigned int count); -int lov_ost_pool_extend(struct ost_pool *op, unsigned int min_count); -int lov_ost_pool_add(struct ost_pool *op, __u32 idx, unsigned int min_count); -int lov_ost_pool_remove(struct ost_pool *op, __u32 idx); -int lov_ost_pool_free(struct ost_pool *op); +/* lu_tgt_pool methods */ +int lov_ost_pool_init(struct lu_tgt_pool *op, unsigned int count); +int lov_ost_pool_extend(struct lu_tgt_pool *op, unsigned int 
min_count); +int lov_ost_pool_add(struct lu_tgt_pool *op, __u32 idx, unsigned int min_count); +int lov_ost_pool_remove(struct lu_tgt_pool *op, __u32 idx); +int lov_ost_pool_free(struct lu_tgt_pool *op); /* high level pool methods */ int lov_pool_new(struct obd_device *obd, char *poolname); diff --git a/lustre/lov/lov_pool.c b/lustre/lov/lov_pool.c index 466c394..be35bd7 100644 --- a/lustre/lov/lov_pool.c +++ b/lustre/lov/lov_pool.c @@ -313,7 +313,7 @@ void lov_dump_pool(int level, struct pool_desc *pool) } #define LOV_POOL_INIT_COUNT 2 -int lov_ost_pool_init(struct ost_pool *op, unsigned int count) +int lov_ost_pool_init(struct lu_tgt_pool *op, unsigned int count) { ENTRY; @@ -333,7 +333,7 @@ int lov_ost_pool_init(struct ost_pool *op, unsigned int count) } /* Caller must hold write op_rwlock */ -int lov_ost_pool_extend(struct ost_pool *op, unsigned int min_count) +int lov_ost_pool_extend(struct lu_tgt_pool *op, unsigned int min_count) { __u32 *new; __u32 new_size; @@ -357,7 +357,7 @@ int lov_ost_pool_extend(struct ost_pool *op, unsigned int min_count) return 0; } -int lov_ost_pool_add(struct ost_pool *op, __u32 idx, unsigned int min_count) +int lov_ost_pool_add(struct lu_tgt_pool *op, __u32 idx, unsigned int min_count) { int rc = 0, i; ENTRY; @@ -382,7 +382,7 @@ out: return rc; } -int lov_ost_pool_remove(struct ost_pool *op, __u32 idx) +int lov_ost_pool_remove(struct lu_tgt_pool *op, __u32 idx) { int i; ENTRY; @@ -404,7 +404,7 @@ int lov_ost_pool_remove(struct ost_pool *op, __u32 idx) RETURN(-EINVAL); } -int lov_ost_pool_free(struct ost_pool *op) +int lov_ost_pool_free(struct lu_tgt_pool *op) { ENTRY; diff --git a/lustre/obdclass/Makefile.in b/lustre/obdclass/Makefile.in index 128ecda..f1925e9 100644 --- a/lustre/obdclass/Makefile.in +++ b/lustre/obdclass/Makefile.in @@ -12,7 +12,6 @@ obdclass-all-objs += cl_object.o cl_page.o cl_lock.o cl_io.o lu_ref.o obdclass-all-objs += linkea.o obdclass-all-objs += kernelcomm.o jobid.o obdclass-all-objs += integrity.o 
obd_cksum.o -obdclass-all-objs += lu_qos.o obdclass-all-objs += lu_tgt_descs.o @SERVER_TRUE@obdclass-all-objs += acl.o diff --git a/lustre/obdclass/lu_qos.c b/lustre/obdclass/lu_qos.c deleted file mode 100644 index 5af09a83..0000000 --- a/lustre/obdclass/lu_qos.c +++ /dev/null @@ -1,515 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * - * lustre/obdclass/lu_qos.c - * - * Lustre QoS. - * These are the only exported functions, they provide some generic - * infrastructure for object allocation QoS - * - */ - -#define DEBUG_SUBSYSTEM S_CLASS - -#include -#include -#include -#include -#include /* hash_long() */ -#include -#include -#include -#include -#include -#include - -void lu_qos_rr_init(struct lu_qos_rr *lqr) -{ - spin_lock_init(&lqr->lqr_alloc); - lqr->lqr_dirty = 1; -} -EXPORT_SYMBOL(lu_qos_rr_init); - -/** - * Add a new target to Quality of Service (QoS) target table. - * - * Add a new MDT/OST target to the structure representing an OSS. Resort the - * list of known MDSs/OSSs by the number of MDTs/OSTs attached to each MDS/OSS. - * The MDS/OSS list is protected internally and no external locking is required. 
- * - * \param[in] qos lu_qos data - * \param[in] ltd target description - * - * \retval 0 on success - * \retval -ENOMEM on error - */ -int lqos_add_tgt(struct lu_qos *qos, struct lu_tgt_desc *ltd) -{ - struct lu_svr_qos *svr = NULL; - struct lu_svr_qos *tempsvr; - struct obd_export *exp = ltd->ltd_exp; - int found = 0; - __u32 id = 0; - int rc = 0; - - ENTRY; - - down_write(&qos->lq_rw_sem); - /* - * a bit hacky approach to learn NID of corresponding connection - * but there is no official API to access information like this - * with OSD API. - */ - list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) { - if (obd_uuid_equals(&svr->lsq_uuid, - &exp->exp_connection->c_remote_uuid)) { - found++; - break; - } - if (svr->lsq_id > id) - id = svr->lsq_id; - } - - if (!found) { - OBD_ALLOC_PTR(svr); - if (!svr) - GOTO(out, rc = -ENOMEM); - memcpy(&svr->lsq_uuid, &exp->exp_connection->c_remote_uuid, - sizeof(svr->lsq_uuid)); - ++id; - svr->lsq_id = id; - } else { - /* Assume we have to move this one */ - list_del(&svr->lsq_svr_list); - } - - svr->lsq_tgt_count++; - ltd->ltd_qos.ltq_svr = svr; - - CDEBUG(D_OTHER, "add tgt %s to server %s (%d targets)\n", - obd_uuid2str(&ltd->ltd_uuid), obd_uuid2str(&svr->lsq_uuid), - svr->lsq_tgt_count); - - /* - * Add sorted by # of tgts. Find the first entry that we're - * bigger than... - */ - list_for_each_entry(tempsvr, &qos->lq_svr_list, lsq_svr_list) { - if (svr->lsq_tgt_count > tempsvr->lsq_tgt_count) - break; - } - /* - * ...and add before it. If we're the first or smallest, tempsvr - * points to the list head, and we add to the end. - */ - list_add_tail(&svr->lsq_svr_list, &tempsvr->lsq_svr_list); - - qos->lq_dirty = 1; - qos->lq_rr.lqr_dirty = 1; - -out: - up_write(&qos->lq_rw_sem); - RETURN(rc); -} -EXPORT_SYMBOL(lqos_add_tgt); - -/** - * Remove MDT/OST target from QoS table. - * - * Removes given MDT/OST target from QoS table and releases related - * MDS/OSS structure if no target remain on the MDS/OSS.
- * - * \param[in] qos lu_qos data - * \param[in] ltd target description - * - * \retval 0 on success - * \retval -ENOENT if no server was found - */ -int lqos_del_tgt(struct lu_qos *qos, struct lu_tgt_desc *ltd) -{ - struct lu_svr_qos *svr; - int rc = 0; - - ENTRY; - - down_write(&qos->lq_rw_sem); - svr = ltd->ltd_qos.ltq_svr; - if (!svr) - GOTO(out, rc = -ENOENT); - - svr->lsq_tgt_count--; - if (svr->lsq_tgt_count == 0) { - CDEBUG(D_OTHER, "removing server %s\n", - obd_uuid2str(&svr->lsq_uuid)); - list_del(&svr->lsq_svr_list); - ltd->ltd_qos.ltq_svr = NULL; - OBD_FREE_PTR(svr); - } - - qos->lq_dirty = 1; - qos->lq_rr.lqr_dirty = 1; -out: - up_write(&qos->lq_rw_sem); - RETURN(rc); -} -EXPORT_SYMBOL(lqos_del_tgt); - -/** - * lu_prandom_u64_max - returns a pseudo-random u64 number in interval - * [0, ep_ro) - * - * \param[in] ep_ro right open interval endpoint - * - * \retval a pseudo-random 64-bit number that is in interval [0, ep_ro). - */ -u64 lu_prandom_u64_max(u64 ep_ro) -{ - u64 rand = 0; - - if (ep_ro) { -#if BITS_PER_LONG == 32 - /* - * If ep_ro > 32-bit, first generate the high - * 32 bits of the random number, then add in the low - * 32 bits (truncated to the upper limit, if needed) - */ - if (ep_ro > 0xffffffffULL) - rand = prandom_u32_max((u32)(ep_ro >> 32)) << 32; - - if (rand == (ep_ro & 0xffffffff00000000ULL)) - rand |= prandom_u32_max((u32)ep_ro); - else - rand |= prandom_u32(); -#else - rand = ((u64)prandom_u32() << 32 | prandom_u32()) % ep_ro; -#endif - } - - return rand; -} -EXPORT_SYMBOL(lu_prandom_u64_max); - -static inline __u64 tgt_statfs_bavail(struct lu_tgt_desc *tgt) -{ - struct obd_statfs *statfs = &tgt->ltd_statfs; - - return statfs->os_bavail * statfs->os_bsize; -} - -static inline __u64 tgt_statfs_iavail(struct lu_tgt_desc *tgt) -{ - return tgt->ltd_statfs.os_ffree; -} - -/** - * Calculate penalties per-tgt and per-server - * - * Re-calculate penalties when the configuration changes, active targets - * change and after statfs refresh 
(all these are reflected by lq_dirty flag). - * On every tgt and server: decay the penalty by half for every 8x the update - * interval that the device has been idle. That gives lots of time for the - * statfs information to be updated (which the penalty is only a proxy for), - * and avoids penalizing server/tgt under light load. - * See lqos_calc_weight() for how penalties are factored into the weight. - * - * \param[in] qos lu_qos - * \param[in] ltd lu_tgt_descs - * \param[in] active_tgt_nr active tgt number - * \param[in] maxage qos max age - * \param[in] is_mdt MDT will count inode usage - * - * \retval 0 on success - * \retval -EAGAIN the number of tgt isn't enough or all tgt spaces are - * almost the same - */ -int lqos_calc_penalties(struct lu_qos *qos, struct lu_tgt_descs *ltd, - __u32 active_tgt_nr, __u32 maxage, bool is_mdt) -{ - struct lu_tgt_desc *tgt; - struct lu_svr_qos *svr; - __u64 ba_max, ba_min, ba; - __u64 ia_max, ia_min, ia = 1; - __u32 num_active; - int prio_wide; - time64_t now, age; - int rc; - - ENTRY; - - if (!qos->lq_dirty) - GOTO(out, rc = 0); - - num_active = active_tgt_nr - 1; - if (num_active < 1) - GOTO(out, rc = -EAGAIN); - - /* find bavail on each server */ - list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) { - svr->lsq_bavail = 0; - /* if inode is not counted, set to 1 to ignore */ - svr->lsq_iavail = is_mdt ? 0 : 1; - } - qos->lq_active_svr_count = 0; - - /* - * How badly user wants to select targets "widely" (not recently chosen - * and not on recent MDS's). As opposed to "freely" (free space avail.) 
- * 0-256 - */ - prio_wide = 256 - qos->lq_prio_free; - - ba_min = (__u64)(-1); - ba_max = 0; - ia_min = (__u64)(-1); - ia_max = 0; - now = ktime_get_real_seconds(); - - /* Calculate server penalty per object */ - ltd_foreach_tgt(ltd, tgt) { - if (!tgt->ltd_active) - continue; - - /* when inode is counted, bavail >> 16 to avoid overflow */ - ba = tgt_statfs_bavail(tgt); - if (is_mdt) - ba >>= 16; - else - ba >>= 8; - if (!ba) - continue; - - ba_min = min(ba, ba_min); - ba_max = max(ba, ba_max); - - /* Count the number of usable servers */ - if (tgt->ltd_qos.ltq_svr->lsq_bavail == 0) - qos->lq_active_svr_count++; - tgt->ltd_qos.ltq_svr->lsq_bavail += ba; - - if (is_mdt) { - /* iavail >> 8 to avoid overflow */ - ia = tgt_statfs_iavail(tgt) >> 8; - if (!ia) - continue; - - ia_min = min(ia, ia_min); - ia_max = max(ia, ia_max); - - tgt->ltd_qos.ltq_svr->lsq_iavail += ia; - } - - /* - * per-tgt penalty is - * prio * bavail * iavail / (num_tgt - 1) / 2 - */ - tgt->ltd_qos.ltq_penalty_per_obj = prio_wide * ba * ia >> 8; - do_div(tgt->ltd_qos.ltq_penalty_per_obj, num_active); - tgt->ltd_qos.ltq_penalty_per_obj >>= 1; - - age = (now - tgt->ltd_qos.ltq_used) >> 3; - if (qos->lq_reset || age > 32 * maxage) - tgt->ltd_qos.ltq_penalty = 0; - else if (age > maxage) - /* Decay tgt penalty. 
*/ - tgt->ltd_qos.ltq_penalty >>= (age / maxage); - } - - num_active = qos->lq_active_svr_count - 1; - if (num_active < 1) { - /* - * If there's only 1 server, we can't penalize it, so instead - * we have to double the tgt penalty - */ - num_active = 1; - ltd_foreach_tgt(ltd, tgt) { - if (!tgt->ltd_active) - continue; - - tgt->ltd_qos.ltq_penalty_per_obj <<= 1; - } - } - - /* - * Per-server penalty is - * prio * bavail * iavail / server_tgts / (num_svr - 1) / 2 - */ - list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) { - ba = svr->lsq_bavail; - ia = svr->lsq_iavail; - svr->lsq_penalty_per_obj = prio_wide * ba * ia >> 8; - do_div(ba, svr->lsq_tgt_count * num_active); - svr->lsq_penalty_per_obj >>= 1; - - age = (now - svr->lsq_used) >> 3; - if (qos->lq_reset || age > 32 * maxage) - svr->lsq_penalty = 0; - else if (age > maxage) - /* Decay server penalty. */ - svr->lsq_penalty >>= age / maxage; - } - - qos->lq_dirty = 0; - qos->lq_reset = 0; - - /* - * If each tgt has almost same free space, do rr allocation for better - * creation performance - */ - qos->lq_same_space = 0; - if ((ba_max * (256 - qos->lq_threshold_rr)) >> 8 < ba_min && - (ia_max * (256 - qos->lq_threshold_rr)) >> 8 < ia_min) { - qos->lq_same_space = 1; - /* Reset weights for the next time we enter qos mode */ - qos->lq_reset = 1; - } - rc = 0; - -out: - if (!rc && qos->lq_same_space) - RETURN(-EAGAIN); - - RETURN(rc); -} -EXPORT_SYMBOL(lqos_calc_penalties); - -bool lqos_is_usable(struct lu_qos *qos, __u32 active_tgt_nr) -{ - if (!qos->lq_dirty && qos->lq_same_space) - return false; - - if (active_tgt_nr < 2) - return false; - - return true; -} -EXPORT_SYMBOL(lqos_is_usable); - -/** - * Calculate weight for a given tgt. - * - * The final tgt weight is bavail >> 16 * iavail >> 8 minus the tgt and server - * penalties. See lqos_calc_ppts() for how penalties are calculated. 
- * - * \param[in] tgt target descriptor - */ -void lqos_calc_weight(struct lu_tgt_desc *tgt) -{ - struct lu_tgt_qos *ltq = &tgt->ltd_qos; - __u64 temp, temp2; - - temp = (tgt_statfs_bavail(tgt) >> 16) * (tgt_statfs_iavail(tgt) >> 8); - temp2 = ltq->ltq_penalty + ltq->ltq_svr->lsq_penalty; - if (temp < temp2) - ltq->ltq_weight = 0; - else - ltq->ltq_weight = temp - temp2; -} -EXPORT_SYMBOL(lqos_calc_weight); - -/** - * Re-calculate weights. - * - * The function is called when some target was used for a new object. In - * this case we should re-calculate all the weights to keep new allocations - * balanced well. - * - * \param[in] qos lu_qos - * \param[in] ltd lu_tgt_descs - * \param[in] tgt target where a new object was placed - * \param[in] active_tgt_nr active tgt number - * \param[out] total_wt new total weight for the pool - * - * \retval 0 - */ -int lqos_recalc_weight(struct lu_qos *qos, struct lu_tgt_descs *ltd, - struct lu_tgt_desc *tgt, __u32 active_tgt_nr, - __u64 *total_wt) -{ - struct lu_tgt_qos *ltq; - struct lu_svr_qos *svr; - - ENTRY; - - ltq = &tgt->ltd_qos; - LASSERT(ltq); - - /* Don't allocate on this device anymore, until the next alloc_qos */ - ltq->ltq_usable = 0; - - svr = ltq->ltq_svr; - - /* - * Decay old penalty by half (we're adding max penalty, and don't - * want it to run away.) 
- */ - ltq->ltq_penalty >>= 1; - svr->lsq_penalty >>= 1; - - /* mark the server and tgt as recently used */ - ltq->ltq_used = svr->lsq_used = ktime_get_real_seconds(); - - /* Set max penalties for this tgt and server */ - ltq->ltq_penalty += ltq->ltq_penalty_per_obj * active_tgt_nr; - svr->lsq_penalty += svr->lsq_penalty_per_obj * active_tgt_nr; - - /* Decrease all MDS penalties */ - list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) { - if (svr->lsq_penalty < svr->lsq_penalty_per_obj) - svr->lsq_penalty = 0; - else - svr->lsq_penalty -= svr->lsq_penalty_per_obj; - } - - *total_wt = 0; - /* Decrease all tgt penalties */ - ltd_foreach_tgt(ltd, tgt) { - if (!tgt->ltd_active) - continue; - - if (ltq->ltq_penalty < ltq->ltq_penalty_per_obj) - ltq->ltq_penalty = 0; - else - ltq->ltq_penalty -= ltq->ltq_penalty_per_obj; - - lqos_calc_weight(tgt); - - /* Recalc the total weight of usable osts */ - if (ltq->ltq_usable) - *total_wt += ltq->ltq_weight; - - CDEBUG(D_OTHER, "recalc tgt %d usable=%d avail=%llu" - " tgtppo=%llu tgtp=%llu svrppo=%llu" - " svrp=%llu wt=%llu\n", - tgt->ltd_index, ltq->ltq_usable, - tgt_statfs_bavail(tgt) >> 10, - ltq->ltq_penalty_per_obj >> 10, - ltq->ltq_penalty >> 10, - ltq->ltq_svr->lsq_penalty_per_obj >> 10, - ltq->ltq_svr->lsq_penalty >> 10, - ltq->ltq_weight >> 10); - } - - RETURN(0); -} -EXPORT_SYMBOL(lqos_recalc_weight); diff --git a/lustre/obdclass/lu_tgt_descs.c b/lustre/obdclass/lu_tgt_descs.c index cc5ab2c..c7d8bbe 100644 --- a/lustre/obdclass/lu_tgt_descs.c +++ b/lustre/obdclass/lu_tgt_descs.c @@ -34,6 +34,7 @@ #include #include +#include #include #include /* hash_long() */ #include @@ -44,17 +45,221 @@ #include /** + * lu_prandom_u64_max - returns a pseudo-random u64 number in interval + * [0, ep_ro) + * + * \param[in] ep_ro right open interval endpoint + * + * \retval a pseudo-random 64-bit number that is in interval [0, ep_ro). 
+ */ +u64 lu_prandom_u64_max(u64 ep_ro) +{ + u64 rand = 0; + + if (ep_ro) { +#if BITS_PER_LONG == 32 + /* + * If ep_ro > 32-bit, first generate the high + * 32 bits of the random number, then add in the low + * 32 bits (truncated to the upper limit, if needed) + */ + if (ep_ro > 0xffffffffULL) + rand = prandom_u32_max((u32)(ep_ro >> 32)) << 32; + + if (rand == (ep_ro & 0xffffffff00000000ULL)) + rand |= prandom_u32_max((u32)ep_ro); + else + rand |= prandom_u32(); +#else + rand = ((u64)prandom_u32() << 32 | prandom_u32()) % ep_ro; +#endif + } + + return rand; +} +EXPORT_SYMBOL(lu_prandom_u64_max); + +void lu_qos_rr_init(struct lu_qos_rr *lqr) +{ + spin_lock_init(&lqr->lqr_alloc); + lqr->lqr_dirty = 1; +} +EXPORT_SYMBOL(lu_qos_rr_init); + +/** + * Add a new target to Quality of Service (QoS) target table. + * + * Add a new MDT/OST target to the structure representing an OSS. Resort the + * list of known MDSs/OSSs by the number of MDTs/OSTs attached to each MDS/OSS. + * The MDS/OSS list is protected internally and no external locking is required. + * + * \param[in] qos lu_qos data + * \param[in] tgt target description + * + * \retval 0 on success + * \retval -ENOMEM on error + */ +int lu_qos_add_tgt(struct lu_qos *qos, struct lu_tgt_desc *tgt) +{ + struct lu_svr_qos *svr = NULL; + struct lu_svr_qos *tempsvr; + struct obd_export *exp = tgt->ltd_exp; + int found = 0; + __u32 id = 0; + int rc = 0; + + ENTRY; + + /* tgt not connected, this function will be called again later */ + if (!exp) + RETURN(0); + + down_write(&qos->lq_rw_sem); + /* + * a bit hacky approach to learn NID of corresponding connection + * but there is no official API to access information like this + * with OSD API. 
+ */ + list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) { + if (obd_uuid_equals(&svr->lsq_uuid, + &exp->exp_connection->c_remote_uuid)) { + found++; + break; + } + if (svr->lsq_id > id) + id = svr->lsq_id; + } + + if (!found) { + OBD_ALLOC_PTR(svr); + if (!svr) + GOTO(out, rc = -ENOMEM); + memcpy(&svr->lsq_uuid, &exp->exp_connection->c_remote_uuid, + sizeof(svr->lsq_uuid)); + ++id; + svr->lsq_id = id; + } else { + /* Assume we have to move this one */ + list_del(&svr->lsq_svr_list); + } + + svr->lsq_tgt_count++; + tgt->ltd_qos.ltq_svr = svr; + + CDEBUG(D_OTHER, "add tgt %s to server %s (%d targets)\n", + obd_uuid2str(&tgt->ltd_uuid), obd_uuid2str(&svr->lsq_uuid), + svr->lsq_tgt_count); + + /* + * Add sorted by # of tgts. Find the first entry that we're + * bigger than... + */ + list_for_each_entry(tempsvr, &qos->lq_svr_list, lsq_svr_list) { + if (svr->lsq_tgt_count > tempsvr->lsq_tgt_count) + break; + } + /* + * ...and add before it. If we're the first or smallest, tempsvr + * points to the list head, and we add to the end. + */ + list_add_tail(&svr->lsq_svr_list, &tempsvr->lsq_svr_list); + + qos->lq_dirty = 1; + qos->lq_rr.lqr_dirty = 1; + +out: + up_write(&qos->lq_rw_sem); + RETURN(rc); +} +EXPORT_SYMBOL(lu_qos_add_tgt); + +/** + * Remove MDT/OST target from QoS table. + * + * Removes given MDT/OST target from QoS table and releases related + * MDS/OSS structure if no target remain on the MDS/OSS. 
+ * + * \param[in] qos lu_qos data + * \param[in] ltd target description + * + * \retval 0 on success + * \retval -ENOENT if no server was found + */ +static int lu_qos_del_tgt(struct lu_qos *qos, struct lu_tgt_desc *ltd) +{ + struct lu_svr_qos *svr; + int rc = 0; + + ENTRY; + + down_write(&qos->lq_rw_sem); + svr = ltd->ltd_qos.ltq_svr; + if (!svr) + GOTO(out, rc = -ENOENT); + + svr->lsq_tgt_count--; + if (svr->lsq_tgt_count == 0) { + CDEBUG(D_OTHER, "removing server %s\n", + obd_uuid2str(&svr->lsq_uuid)); + list_del(&svr->lsq_svr_list); + ltd->ltd_qos.ltq_svr = NULL; + OBD_FREE_PTR(svr); + } + + qos->lq_dirty = 1; + qos->lq_rr.lqr_dirty = 1; +out: + up_write(&qos->lq_rw_sem); + RETURN(rc); +} + +static inline __u64 tgt_statfs_bavail(struct lu_tgt_desc *tgt) +{ + struct obd_statfs *statfs = &tgt->ltd_statfs; + + return statfs->os_bavail * statfs->os_bsize; +} + +static inline __u64 tgt_statfs_iavail(struct lu_tgt_desc *tgt) +{ + return tgt->ltd_statfs.os_ffree; +} + +/** + * Calculate weight for a given tgt. + * + * The final tgt weight is bavail >> 16 * iavail >> 8 minus the tgt and server + * penalties. See ltd_qos_penalties_calc() for how penalties are calculated. + * + * \param[in] tgt target descriptor + */ +void lu_tgt_qos_weight_calc(struct lu_tgt_desc *tgt) +{ + struct lu_tgt_qos *ltq = &tgt->ltd_qos; + __u64 temp, temp2; + + temp = (tgt_statfs_bavail(tgt) >> 16) * (tgt_statfs_iavail(tgt) >> 8); + temp2 = ltq->ltq_penalty + ltq->ltq_svr->lsq_penalty; + if (temp < temp2) + ltq->ltq_weight = 0; + else + ltq->ltq_weight = temp - temp2; +} +EXPORT_SYMBOL(lu_tgt_qos_weight_calc); + +/** * Allocate and initialize target table. * * A helper function to initialize the target table and allocate * a bitmap of the available targets. 
*
  * \param[in] ltd		target's table to initialize
+ * \param[in] is_mdt		target table for MDTs
  *
  * \retval 0			on success
  * \retval negative		negated errno on error
  **/
-int lu_tgt_descs_init(struct lu_tgt_descs *ltd)
+int lu_tgt_descs_init(struct lu_tgt_descs *ltd, bool is_mdt)
 {
 	mutex_init(&ltd->ltd_mutex);
 	init_rwsem(&ltd->ltd_rw_sem);
@@ -68,11 +273,22 @@ int lu_tgt_descs_init(struct lu_tgt_descs *ltd)
 		return -ENOMEM;
 
 	ltd->ltd_tgts_size = BITS_PER_LONG;
-	ltd->ltd_tgtnr = 0;
-
 	ltd->ltd_death_row = 0;
 	ltd->ltd_refcount = 0;
 
+	/* Set up allocation policy (QoS and RR) */
+	INIT_LIST_HEAD(&ltd->ltd_qos.lq_svr_list);
+	init_rwsem(&ltd->ltd_qos.lq_rw_sem);
+	ltd->ltd_qos.lq_dirty = 1;
+	ltd->ltd_qos.lq_reset = 1;
+	/* Default priority is toward free space balance */
+	ltd->ltd_qos.lq_prio_free = 232;
+	/* Default threshold for rr (roughly 17%) */
+	ltd->ltd_qos.lq_threshold_rr = 43;
+	ltd->ltd_is_mdt = is_mdt;
+
+	lu_qos_rr_init(&ltd->ltd_qos.lq_rr);
+
 	return 0;
 }
 EXPORT_SYMBOL(lu_tgt_descs_init);
@@ -151,7 +367,7 @@ static int lu_tgt_descs_resize(struct lu_tgt_descs *ltd, __u32 newsize)
  * \retval -ENOMEM	if reallocation failed
  *	   -EEXIST	if target existed
  */
-int lu_tgt_descs_add(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt)
+int ltd_add_tgt(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt)
 {
 	__u32 index = tgt->ltd_index;
 	int rc;
@@ -179,19 +395,293 @@ int lu_tgt_descs_add(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt)
 
 	LTD_TGT(ltd, tgt->ltd_index) = tgt;
 	cfs_bitmap_set(ltd->ltd_tgt_bitmap, tgt->ltd_index);
-	ltd->ltd_tgtnr++;
+
+	ltd->ltd_lov_desc.ld_tgt_count++;
+	if (tgt->ltd_active)
+		ltd->ltd_lov_desc.ld_active_tgt_count++;
 
 	RETURN(0);
 }
-EXPORT_SYMBOL(lu_tgt_descs_add);
+EXPORT_SYMBOL(ltd_add_tgt);
 
 /**
  * Delete target from target table
  */
-void lu_tgt_descs_del(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt)
+void ltd_del_tgt(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt)
 {
+	/* best effort: a tgt without a QoS server entry yields -ENOENT */
+	lu_qos_del_tgt(&ltd->ltd_qos, tgt);
 	LTD_TGT(ltd, tgt->ltd_index) = NULL;
 	cfs_bitmap_clear(ltd->ltd_tgt_bitmap, tgt->ltd_index);
-	ltd->ltd_tgtnr--;
+	ltd->ltd_lov_desc.ld_tgt_count--;
+	if (tgt->ltd_active)
+		ltd->ltd_lov_desc.ld_active_tgt_count--;
+}
+EXPORT_SYMBOL(ltd_del_tgt);
+
+/**
+ * Whether QoS data is up-to-date and QoS can be applied.
+ *
+ * \param[in] ltd	lu_tgt_descs
+ *
+ * \retval false	QoS data is clean and all tgts have about the same
+ *			space, or fewer than 2 tgts are active
+ * \retval true		otherwise
+ */
+bool ltd_qos_is_usable(struct lu_tgt_descs *ltd)
+{
+	if (!ltd->ltd_qos.lq_dirty && ltd->ltd_qos.lq_same_space)
+		return false;
+
+	if (ltd->ltd_lov_desc.ld_active_tgt_count < 2)
+		return false;
+
+	return true;
+}
+EXPORT_SYMBOL(ltd_qos_is_usable);
+
+/**
+ * Calculate penalties per-tgt and per-server
+ *
+ * Re-calculate penalties when the configuration changes, active targets
+ * change and after statfs refresh (all these are reflected by lq_dirty flag).
+ * On every tgt and server: decay the penalty by half for every 8x the update
+ * interval that the device has been idle. That gives lots of time for the
+ * statfs information to be updated (which the penalty is only a proxy for),
+ * and avoids penalizing server/tgt under light load.
+ * See lu_tgt_qos_weight_calc() for how penalties are factored into the weight.
+ *
+ * \param[in] ltd		lu_tgt_descs
+ *
+ * \retval 0		on success
+ * \retval -EAGAIN	the number of tgt isn't enough or all tgt spaces are
+ *			almost the same
+ */
+int ltd_qos_penalties_calc(struct lu_tgt_descs *ltd)
+{
+	struct lu_qos *qos = &ltd->ltd_qos;
+	struct lov_desc *desc = &ltd->ltd_lov_desc;
+	struct lu_tgt_desc *tgt;
+	struct lu_svr_qos *svr;
+	__u64 ba_max, ba_min, ba;
+	__u64 ia_max, ia_min, ia = 1;
+	__u32 num_active;
+	int prio_wide;
+	time64_t now, age;
+	int rc;
+
+	ENTRY;
+
+	if (!qos->lq_dirty)
+		GOTO(out, rc = 0);
+
+	/*
+	 * Compare before subtracting: num_active is unsigned, so "count - 1"
+	 * would wrap around instead of going below 1 when no tgt is active.
+	 */
+	if (desc->ld_active_tgt_count < 2)
+		GOTO(out, rc = -EAGAIN);
+	num_active = desc->ld_active_tgt_count - 1;
+
+	/* find bavail on each server */
+	list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) {
+		svr->lsq_bavail = 0;
+		/* if inode is not counted, set to 1 to ignore */
+		svr->lsq_iavail = ltd->ltd_is_mdt ? 0 : 1;
+	}
+	qos->lq_active_svr_count = 0;
+
+	/*
+	 * How badly user wants to select targets "widely" (not recently chosen
+	 * and not on recent MDS's). As opposed to "freely" (free space avail.)
+	 * 0-256
+	 */
+	prio_wide = 256 - qos->lq_prio_free;
+
+	ba_min = (__u64)(-1);
+	ba_max = 0;
+	ia_min = (__u64)(-1);
+	ia_max = 0;
+	now = ktime_get_real_seconds();
+
+	/* Calculate server penalty per object */
+	ltd_foreach_tgt(ltd, tgt) {
+		if (!tgt->ltd_active)
+			continue;
+
+		/* when inode is counted, bavail >> 16 to avoid overflow */
+		ba = tgt_statfs_bavail(tgt);
+		if (ltd->ltd_is_mdt)
+			ba >>= 16;
+		else
+			ba >>= 8;
+		if (!ba)
+			continue;
+
+		ba_min = min(ba, ba_min);
+		ba_max = max(ba, ba_max);
+
+		/* Count the number of usable servers */
+		if (tgt->ltd_qos.ltq_svr->lsq_bavail == 0)
+			qos->lq_active_svr_count++;
+		tgt->ltd_qos.ltq_svr->lsq_bavail += ba;
+
+		if (ltd->ltd_is_mdt) {
+			/* iavail >> 8 to avoid overflow */
+			ia = tgt_statfs_iavail(tgt) >> 8;
+			if (!ia)
+				continue;
+
+			ia_min = min(ia, ia_min);
+			ia_max = max(ia, ia_max);
+
+			tgt->ltd_qos.ltq_svr->lsq_iavail += ia;
+		}
+
+		/*
+		 * per-tgt penalty is
+		 * prio * bavail * iavail / (num_tgt - 1) / 2
+		 */
+		tgt->ltd_qos.ltq_penalty_per_obj = prio_wide * ba * ia;
+		do_div(tgt->ltd_qos.ltq_penalty_per_obj, num_active);
+		tgt->ltd_qos.ltq_penalty_per_obj >>= 1;
+
+		/* age is the idle time in units of 8 seconds */
+		age = (now - tgt->ltd_qos.ltq_used) >> 3;
+		if (qos->lq_reset || age > 32 * desc->ld_qos_maxage)
+			tgt->ltd_qos.ltq_penalty = 0;
+		else if (age > desc->ld_qos_maxage)
+			/* Decay tgt penalty. */
+			tgt->ltd_qos.ltq_penalty >>= age / desc->ld_qos_maxage;
+	}
+
+	num_active = qos->lq_active_svr_count - 1;
+	if (num_active < 1) {
+		/*
+		 * If there's only 1 server, we can't penalize it, so instead
+		 * we have to double the tgt penalty
+		 */
+		num_active = 1;
+		ltd_foreach_tgt(ltd, tgt) {
+			if (!tgt->ltd_active)
+				continue;
+
+			tgt->ltd_qos.ltq_penalty_per_obj <<= 1;
+		}
+	}
+
+	/*
+	 * Per-server penalty is
+	 * prio * bavail * iavail / server_tgts / (num_svr - 1) / 2
+	 */
+	list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) {
+		ba = svr->lsq_bavail;
+		ia = svr->lsq_iavail;
+		svr->lsq_penalty_per_obj = prio_wide * ba * ia;
+		/* scale the penalty itself, not the scratch copy of bavail */
+		do_div(svr->lsq_penalty_per_obj,
+		       svr->lsq_tgt_count * num_active);
+		svr->lsq_penalty_per_obj >>= 1;
+
+		age = (now - svr->lsq_used) >> 3;
+		if (qos->lq_reset || age > 32 * desc->ld_qos_maxage)
+			svr->lsq_penalty = 0;
+		else if (age > desc->ld_qos_maxage)
+			/* Decay server penalty. */
+			svr->lsq_penalty >>= age / desc->ld_qos_maxage;
+	}
+
+	qos->lq_dirty = 0;
+	qos->lq_reset = 0;
+
+	/*
+	 * If each tgt has almost same free space, do rr allocation for better
+	 * creation performance
+	 */
+	qos->lq_same_space = 0;
+	if ((ba_max * (256 - qos->lq_threshold_rr)) >> 8 < ba_min &&
+	    (ia_max * (256 - qos->lq_threshold_rr)) >> 8 < ia_min) {
+		qos->lq_same_space = 1;
+		/* Reset weights for the next time we enter qos mode */
+		qos->lq_reset = 1;
+	}
+	rc = 0;
+
+out:
+	/* clean data with evenly filled tgts still means "use rr instead" */
+	if (!rc && qos->lq_same_space)
+		RETURN(-EAGAIN);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(ltd_qos_penalties_calc);
+
+/**
+ * Re-calculate penalties and weights of all tgts.
+ *
+ * The function is called when some target was used for a new object. In
+ * this case we should re-calculate all the weights to keep new allocations
+ * balanced well.
+ *
+ * \param[in] ltd		lu_tgt_descs
+ * \param[in] tgt		recently used tgt
+ * \param[out] total_wt		new total weight for the pool, summed over
+ *				the active tgts still marked usable
+ *
+ * \retval		0
+ */
+int ltd_qos_update(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt,
+		   __u64 *total_wt)
+{
+	struct lu_qos *qos = &ltd->ltd_qos;
+	struct lu_tgt_qos *ltq;
+	struct lu_svr_qos *svr;
+
+	ENTRY;
+
+	ltq = &tgt->ltd_qos;
+	LASSERT(ltq);
+
+	/* Don't allocate on this device anymore, until the next alloc_qos */
+	ltq->ltq_usable = 0;
+
+	svr = ltq->ltq_svr;
+
+	/*
+	 * Decay old penalty by half (we're adding max penalty, and don't
+	 * want it to run away.)
+	 */
+	ltq->ltq_penalty >>= 1;
+	svr->lsq_penalty >>= 1;
+
+	/* mark the server and tgt as recently used */
+	ltq->ltq_used = svr->lsq_used = ktime_get_real_seconds();
+
+	/* Set max penalties for this tgt and server */
+	ltq->ltq_penalty += ltq->ltq_penalty_per_obj *
+			    ltd->ltd_lov_desc.ld_active_tgt_count;
+	svr->lsq_penalty += svr->lsq_penalty_per_obj *
+			    ltd->ltd_lov_desc.ld_active_tgt_count;
+
+	/* Decrease all server penalties, never going below zero */
+	list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) {
+		if (svr->lsq_penalty < svr->lsq_penalty_per_obj)
+			svr->lsq_penalty = 0;
+		else
+			svr->lsq_penalty -= svr->lsq_penalty_per_obj;
+	}
+
+	*total_wt = 0;
+	/* Decrease all tgt penalties; \a tgt is reused as the loop cursor */
+	ltd_foreach_tgt(ltd, tgt) {
+		if (!tgt->ltd_active)
+			continue;
+
+		/*
+		 * Switch to the QoS data of the tgt being visited, otherwise
+		 * only the recently used tgt would be updated and accounted.
+		 */
+		ltq = &tgt->ltd_qos;
+		if (ltq->ltq_penalty < ltq->ltq_penalty_per_obj)
+			ltq->ltq_penalty = 0;
+		else
+			ltq->ltq_penalty -= ltq->ltq_penalty_per_obj;
+
+		lu_tgt_qos_weight_calc(tgt);
+
+		/* Recalc the total weight of usable osts */
+		if (ltq->ltq_usable)
+			*total_wt += ltq->ltq_weight;
+
+		CDEBUG(D_OTHER, "recalc tgt %d usable=%d avail=%llu tgtppo=%llu tgtp=%llu svrppo=%llu svrp=%llu wt=%llu\n",
+		       tgt->ltd_index, ltq->ltq_usable,
+		       tgt_statfs_bavail(tgt) >> 10,
+		       ltq->ltq_penalty_per_obj >> 10,
+		       ltq->ltq_penalty >> 10,
+		       ltq->ltq_svr->lsq_penalty_per_obj >> 10,
+		       ltq->ltq_svr->lsq_penalty >> 10,
+		       ltq->ltq_weight >> 10);
+	}
+
+	RETURN(0);
 }
-EXPORT_SYMBOL(lu_tgt_descs_del);
+EXPORT_SYMBOL(ltd_qos_update);