X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Flod%2Flod_qos.c;h=fa5903eb7fa18ccd3648186104144fedd64e55ee;hp=fbc35cd0546044a65af24746221cc8353d953459;hb=b384ea39e593cda1ac4d6fb8b955d0c7d1a1f67b;hpb=645b72c5c0586e2183933ee52ad43d91c2eb3ad6;ds=sidebyside diff --git a/lustre/lod/lod_qos.c b/lustre/lod/lod_qos.c index fbc35cd..fa5903e 100644 --- a/lustre/lod/lod_qos.c +++ b/lustre/lod/lod_qos.c @@ -60,90 +60,131 @@ #define TGT_BAVAIL(i) (OST_TGT(lod,i)->ltd_statfs.os_bavail * \ OST_TGT(lod,i)->ltd_statfs.os_bsize) +static inline int lod_statfs_check(struct lu_tgt_descs *ltd, + struct lu_tgt_desc *tgt) +{ + struct obd_statfs *sfs = &tgt->ltd_statfs; + + if (((sfs->os_state & OS_STATFS_ENOSPC) || + (!ltd->ltd_is_mdt && sfs->os_state & OS_STATFS_ENOINO && + sfs->os_fprecreated == 0))) + return -ENOSPC; + + /* If the OST is readonly then we can't allocate objects there */ + if (sfs->os_state & OS_STATFS_READONLY) + return -EROFS; + + /* object precreation is skipped on the OST with max_create_count=0 */ + if (!ltd->ltd_is_mdt && sfs->os_state & OS_STATFS_NOPRECREATE) + return -ENOBUFS; + + return 0; +} + /** - * Check whether the target is available for new OST objects. + * Check whether the target is available for new objects. * * Request statfs data from the given target and verify it's active and not - * read-only. If so, then it can be used to place new OST objects. This + * read-only. If so, then it can be used to place new objects. This * function also maintains the number of active/inactive targets and sets * dirty flags if those numbers change so others can run re-balance procedures. * No external locking is required. 
* * \param[in] env execution environment for this thread * \param[in] d LOD device - * \param[in] index index of OST target to check - * \param[out] sfs buffer for statfs data + * \param[in] ltd target table + * \param[in] tgt target * * \retval 0 if the target is good * \retval negative negated errno on error - */ static int lod_statfs_and_check(const struct lu_env *env, struct lod_device *d, - int index, struct obd_statfs *sfs) + struct lu_tgt_descs *ltd, + struct lu_tgt_desc *tgt, __u64 reserve) { - struct lod_tgt_desc *ost; - int rc; + struct obd_statfs_info info = { 0 }; + struct lov_desc *desc = &ltd->ltd_lov_desc; + int rc; ENTRY; LASSERT(d); - ost = OST_TGT(d,index); - LASSERT(ost); - - rc = dt_statfs(env, ost->ltd_ost, sfs); - - if (rc == 0 && ((sfs->os_state & OS_STATE_ENOSPC) || - (sfs->os_state & OS_STATE_ENOINO && sfs->os_fprecreated == 0))) - RETURN(-ENOSPC); + LASSERT(tgt); + info.os_enable_pre = 1; + rc = dt_statfs_info(env, tgt->ltd_tgt, &tgt->ltd_statfs, &info); if (rc && rc != -ENOTCONN) CERROR("%s: statfs: rc = %d\n", lod2obd(d)->obd_name, rc); - /* If the OST is readonly then we can't allocate objects there */ - if (sfs->os_state & OS_STATE_READONLY) - rc = -EROFS; + if (!rc) { + rc = lod_statfs_check(ltd, tgt); + if (rc == -ENOSPC) + return rc; + } - /* object precreation is skipped on the OST with max_create_count=0 */ - if (sfs->os_state & OS_STATE_NOPRECREATE) - rc = -ENOBUFS; + if (reserve && + (reserve + (info.os_reserved_mb_low << 20) > + tgt->ltd_statfs.os_bavail * tgt->ltd_statfs.os_bsize)) + return -ENOSPC; /* check whether device has changed state (active, inactive) */ - if (rc != 0 && ost->ltd_active) { + if (rc != 0 && tgt->ltd_active) { /* turned inactive? 
*/ spin_lock(&d->lod_lock); - if (ost->ltd_active) { - ost->ltd_active = 0; + if (tgt->ltd_active) { + tgt->ltd_active = 0; if (rc == -ENOTCONN) - ost->ltd_connecting = 1; + tgt->ltd_connecting = 1; - LASSERT(d->lod_desc.ld_active_tgt_count > 0); - d->lod_desc.ld_active_tgt_count--; - d->lod_qos.lq_dirty = 1; - d->lod_qos.lq_rr.lqr_dirty = 1; + LASSERT(desc->ld_active_tgt_count > 0); + desc->ld_active_tgt_count--; + set_bit(LQ_DIRTY, &ltd->ltd_qos.lq_flags); + set_bit(LQ_DIRTY, &ltd->ltd_qos.lq_rr.lqr_flags); CDEBUG(D_CONFIG, "%s: turns inactive\n", - ost->ltd_exp->exp_obd->obd_name); + tgt->ltd_exp->exp_obd->obd_name); } spin_unlock(&d->lod_lock); - } else if (rc == 0 && ost->ltd_active == 0) { + } else if (rc == 0 && tgt->ltd_active == 0) { /* turned active? */ - LASSERTF(d->lod_desc.ld_active_tgt_count < d->lod_ostnr, - "active tgt count %d, ost nr %d\n", - d->lod_desc.ld_active_tgt_count, d->lod_ostnr); + LASSERTF(desc->ld_active_tgt_count < desc->ld_tgt_count, + "active tgt count %d, tgt nr %d\n", + desc->ld_active_tgt_count, desc->ld_tgt_count); spin_lock(&d->lod_lock); - if (ost->ltd_active == 0) { - ost->ltd_active = 1; - ost->ltd_connecting = 0; - d->lod_desc.ld_active_tgt_count++; - d->lod_qos.lq_dirty = 1; - d->lod_qos.lq_rr.lqr_dirty = 1; + if (tgt->ltd_active == 0) { + tgt->ltd_active = 1; + tgt->ltd_connecting = 0; + desc->ld_active_tgt_count++; + set_bit(LQ_DIRTY, &ltd->ltd_qos.lq_flags); + set_bit(LQ_DIRTY, &ltd->ltd_qos.lq_rr.lqr_flags); CDEBUG(D_CONFIG, "%s: turns active\n", - ost->ltd_exp->exp_obd->obd_name); + tgt->ltd_exp->exp_obd->obd_name); } spin_unlock(&d->lod_lock); } + if (rc == -ENOTCONN) { + /* In case that the ENOTCONN for inactive OST state is + * mistreated as MDT disconnection state by the client, + * this error should be changed to someone else. 
+ */ + rc = -EREMOTEIO; + } RETURN(rc); } +static int lod_is_tgt_usable(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt) +{ + int rc; + + rc = lod_statfs_check(ltd, tgt); + if (rc) + return rc; + + if (!tgt->ltd_active) + return -ENOTCONN; + + return 0; +} + /** * Maintain per-target statfs data. * @@ -153,293 +194,44 @@ static int lod_statfs_and_check(const struct lu_env *env, struct lod_device *d, * * \param[in] env execution environment for this thread * \param[in] lod LOD device + * \param[in] ltd tgt table */ -void lod_qos_statfs_update(const struct lu_env *env, struct lod_device *lod) +void lod_qos_statfs_update(const struct lu_env *env, struct lod_device *lod, + struct lu_tgt_descs *ltd) { struct obd_device *obd = lod2obd(lod); - struct ost_pool *osts = &(lod->lod_pool_info); + struct lu_tgt_desc *tgt; time64_t max_age; - unsigned int i; u64 avail; - int idx; ENTRY; - max_age = ktime_get_seconds() - 2 * lod->lod_desc.ld_qos_maxage; + max_age = ktime_get_seconds() - 2 * ltd->ltd_lov_desc.ld_qos_maxage; if (obd->obd_osfs_age > max_age) /* statfs data are quite recent, don't need to refresh it */ RETURN_EXIT; - down_write(&lod->lod_qos.lq_rw_sem); + down_write(&ltd->ltd_qos.lq_rw_sem); if (obd->obd_osfs_age > max_age) goto out; - for (i = 0; i < osts->op_count; i++) { - idx = osts->op_array[i]; - avail = OST_TGT(lod,idx)->ltd_statfs.os_bavail; - if (lod_statfs_and_check(env, lod, idx, - &OST_TGT(lod, idx)->ltd_statfs)) + ltd_foreach_tgt(ltd, tgt) { + avail = tgt->ltd_statfs.os_bavail; + if (lod_statfs_and_check(env, lod, ltd, tgt, 0)) continue; - if (OST_TGT(lod,idx)->ltd_statfs.os_bavail != avail) + + if (tgt->ltd_statfs.os_bavail != avail) /* recalculate weigths */ - lod->lod_qos.lq_dirty = 1; + set_bit(LQ_DIRTY, &ltd->ltd_qos.lq_flags); } obd->obd_osfs_age = ktime_get_seconds(); out: - up_write(&lod->lod_qos.lq_rw_sem); + up_write(&ltd->ltd_qos.lq_rw_sem); EXIT; } -/** - * Calculate per-OST and per-OSS penalties - * - * Re-calculate penalties when the 
configuration changes, active targets - * change and after statfs refresh (all these are reflected by lq_dirty flag). - * On every OST and OSS: decay the penalty by half for every 8x the update - * interval that the device has been idle. That gives lots of time for the - * statfs information to be updated (which the penalty is only a proxy for), - * and avoids penalizing OSS/OSTs under light load. - * See lod_qos_calc_weight() for how penalties are factored into the weight. - * - * \param[in] lod LOD device - * - * \retval 0 on success - * \retval -EAGAIN the number of OSTs isn't enough - */ -static int lod_qos_calc_ppo(struct lod_device *lod) -{ - struct lu_svr_qos *oss; - __u64 ba_max, ba_min, temp; - __u32 num_active; - unsigned int i; - int rc, prio_wide; - time64_t now, age; - - ENTRY; - - if (!lod->lod_qos.lq_dirty) - GOTO(out, rc = 0); - - num_active = lod->lod_desc.ld_active_tgt_count - 1; - if (num_active < 1) - GOTO(out, rc = -EAGAIN); - - /* find bavail on each OSS */ - list_for_each_entry(oss, &lod->lod_qos.lq_svr_list, lsq_svr_list) - oss->lsq_bavail = 0; - lod->lod_qos.lq_active_svr_count = 0; - - /* - * How badly user wants to select OSTs "widely" (not recently chosen - * and not on recent OSS's). As opposed to "freely" (free space - * avail.) 
0-256 - */ - prio_wide = 256 - lod->lod_qos.lq_prio_free; - - ba_min = (__u64)(-1); - ba_max = 0; - now = ktime_get_real_seconds(); - /* Calculate OST penalty per object - * (lod ref taken in lod_qos_prep_create()) - */ - cfs_foreach_bit(lod->lod_ost_bitmap, i) { - LASSERT(OST_TGT(lod,i)); - temp = TGT_BAVAIL(i); - if (!temp) - continue; - ba_min = min(temp, ba_min); - ba_max = max(temp, ba_max); - - /* Count the number of usable OSS's */ - if (OST_TGT(lod, i)->ltd_qos.ltq_svr->lsq_bavail == 0) - lod->lod_qos.lq_active_svr_count++; - OST_TGT(lod, i)->ltd_qos.ltq_svr->lsq_bavail += temp; - - /* per-OST penalty is prio * TGT_bavail / (num_ost - 1) / 2 */ - temp >>= 1; - do_div(temp, num_active); - OST_TGT(lod,i)->ltd_qos.ltq_penalty_per_obj = - (temp * prio_wide) >> 8; - - age = (now - OST_TGT(lod,i)->ltd_qos.ltq_used) >> 3; - if (lod->lod_qos.lq_reset || - age > 32 * lod->lod_desc.ld_qos_maxage) - OST_TGT(lod,i)->ltd_qos.ltq_penalty = 0; - else if (age > lod->lod_desc.ld_qos_maxage) - /* Decay OST penalty. */ - OST_TGT(lod,i)->ltd_qos.ltq_penalty >>= - (age / lod->lod_desc.ld_qos_maxage); - } - - num_active = lod->lod_qos.lq_active_svr_count - 1; - if (num_active < 1) { - /* If there's only 1 OSS, we can't penalize it, so instead - we have to double the OST penalty */ - num_active = 1; - cfs_foreach_bit(lod->lod_ost_bitmap, i) - OST_TGT(lod,i)->ltd_qos.ltq_penalty_per_obj <<= 1; - } - - /* Per-OSS penalty is prio * oss_avail / oss_osts / (num_oss - 1) / 2 */ - list_for_each_entry(oss, &lod->lod_qos.lq_svr_list, lsq_svr_list) { - temp = oss->lsq_bavail >> 1; - do_div(temp, oss->lsq_tgt_count * num_active); - oss->lsq_penalty_per_obj = (temp * prio_wide) >> 8; - - age = (now - oss->lsq_used) >> 3; - if (lod->lod_qos.lq_reset || - age > 32 * lod->lod_desc.ld_qos_maxage) - oss->lsq_penalty = 0; - else if (age > lod->lod_desc.ld_qos_maxage) - /* Decay OSS penalty. 
*/ - oss->lsq_penalty >>= age / lod->lod_desc.ld_qos_maxage; - } - - lod->lod_qos.lq_dirty = 0; - lod->lod_qos.lq_reset = 0; - - /* If each ost has almost same free space, - * do rr allocation for better creation performance */ - lod->lod_qos.lq_same_space = 0; - if ((ba_max * (256 - lod->lod_qos.lq_threshold_rr)) >> 8 < ba_min) { - lod->lod_qos.lq_same_space = 1; - /* Reset weights for the next time we enter qos mode */ - lod->lod_qos.lq_reset = 1; - } - rc = 0; - -out: -#ifndef FORCE_QOS - if (!rc && lod->lod_qos.lq_same_space) - RETURN(-EAGAIN); -#endif - RETURN(rc); -} - -/** - * Calculate weight for a given OST target. - * - * The final OST weight is the number of bytes available minus the OST and - * OSS penalties. See lod_qos_calc_ppo() for how penalties are calculated. - * - * \param[in] lod LOD device, where OST targets are listed - * \param[in] i OST target index - * - * \retval 0 - */ -static int lod_qos_calc_weight(struct lod_device *lod, int i) -{ - __u64 temp, temp2; - - temp = TGT_BAVAIL(i); - temp2 = OST_TGT(lod, i)->ltd_qos.ltq_penalty + - OST_TGT(lod, i)->ltd_qos.ltq_svr->lsq_penalty; - if (temp < temp2) - OST_TGT(lod, i)->ltd_qos.ltq_weight = 0; - else - OST_TGT(lod, i)->ltd_qos.ltq_weight = temp - temp2; - return 0; -} - -/** - * Re-calculate weights. - * - * The function is called when some OST target was used for a new object. In - * this case we should re-calculate all the weights to keep new allocations - * balanced well. 
- * - * \param[in] lod LOD device - * \param[in] osts OST pool where a new object was placed - * \param[in] index OST target where a new object was placed - * \param[out] total_wt new total weight for the pool - * - * \retval 0 - */ -static int lod_qos_used(struct lod_device *lod, struct ost_pool *osts, - __u32 index, __u64 *total_wt) -{ - struct lod_tgt_desc *ost; - struct lu_svr_qos *oss; - unsigned int j; - ENTRY; - - ost = OST_TGT(lod,index); - LASSERT(ost); - - /* Don't allocate on this devuce anymore, until the next alloc_qos */ - ost->ltd_qos.ltq_usable = 0; - - oss = ost->ltd_qos.ltq_svr; - - /* Decay old penalty by half (we're adding max penalty, and don't - want it to run away.) */ - ost->ltd_qos.ltq_penalty >>= 1; - oss->lsq_penalty >>= 1; - - /* mark the OSS and OST as recently used */ - ost->ltd_qos.ltq_used = oss->lsq_used = ktime_get_real_seconds(); - - /* Set max penalties for this OST and OSS */ - ost->ltd_qos.ltq_penalty += - ost->ltd_qos.ltq_penalty_per_obj * lod->lod_ostnr; - oss->lsq_penalty += oss->lsq_penalty_per_obj * - lod->lod_qos.lq_active_svr_count; - - /* Decrease all OSS penalties */ - list_for_each_entry(oss, &lod->lod_qos.lq_svr_list, lsq_svr_list) { - if (oss->lsq_penalty < oss->lsq_penalty_per_obj) - oss->lsq_penalty = 0; - else - oss->lsq_penalty -= oss->lsq_penalty_per_obj; - } - - *total_wt = 0; - /* Decrease all OST penalties */ - for (j = 0; j < osts->op_count; j++) { - int i; - - i = osts->op_array[j]; - if (!cfs_bitmap_check(lod->lod_ost_bitmap, i)) - continue; - - ost = OST_TGT(lod,i); - LASSERT(ost); - - if (ost->ltd_qos.ltq_penalty < - ost->ltd_qos.ltq_penalty_per_obj) - ost->ltd_qos.ltq_penalty = 0; - else - ost->ltd_qos.ltq_penalty -= - ost->ltd_qos.ltq_penalty_per_obj; - - lod_qos_calc_weight(lod, i); - - /* Recalc the total weight of usable osts */ - if (ost->ltd_qos.ltq_usable) - *total_wt += ost->ltd_qos.ltq_weight; - - QOS_DEBUG("recalc tgt %d usable=%d avail=%llu" - " ostppo=%llu ostp=%llu ossppo=%llu" - " 
ossp=%llu wt=%llu\n", - i, ost->ltd_qos.ltq_usable, TGT_BAVAIL(i) >> 10, - ost->ltd_qos.ltq_penalty_per_obj >> 10, - ost->ltd_qos.ltq_penalty >> 10, - ost->ltd_qos.ltq_svr->lsq_penalty_per_obj >> 10, - ost->ltd_qos.ltq_svr->lsq_penalty >> 10, - ost->ltd_qos.ltq_weight >> 10); - } - - RETURN(0); -} - -void lod_qos_rr_init(struct lu_qos_rr *lqr) -{ - spin_lock_init(&lqr->lqr_alloc); - lqr->lqr_dirty = 1; -} - #define LOV_QOS_EMPTY ((__u32)-1) /** @@ -452,37 +244,39 @@ void lod_qos_rr_init(struct lu_qos_rr *lqr) * a new target or activation/deactivation). * * \param[in] lod LOD device - * \param[in] src_pool OST pool + * \param[in] ltd tgt table + * \param[in] src_pool tgt pool * \param[in] lqr round-robin list * * \retval 0 on success * \retval -ENOMEM fails to allocate the array */ -static int lod_qos_calc_rr(struct lod_device *lod, struct ost_pool *src_pool, +static int lod_qos_calc_rr(struct lod_device *lod, struct lu_tgt_descs *ltd, + const struct lu_tgt_pool *src_pool, struct lu_qos_rr *lqr) { - struct lu_svr_qos *oss; - struct lod_tgt_desc *ost; + struct lu_svr_qos *svr; + struct lu_tgt_desc *tgt; unsigned placed, real_count; unsigned int i; int rc; ENTRY; - if (!lqr->lqr_dirty) { + if (!test_bit(LQ_DIRTY, &lqr->lqr_flags)) { LASSERT(lqr->lqr_pool.op_size); RETURN(0); } /* Do actual allocation. */ - down_write(&lod->lod_qos.lq_rw_sem); + down_write(&ltd->ltd_qos.lq_rw_sem); /* * Check again. While we were sleeping on @lq_rw_sem something could * change. */ - if (!lqr->lqr_dirty) { + if (!test_bit(LQ_DIRTY, &lqr->lqr_flags)) { LASSERT(lqr->lqr_pool.op_size); - up_write(&lod->lod_qos.lq_rw_sem); + up_write(&ltd->ltd_qos.lq_rw_sem); RETURN(0); } @@ -493,33 +287,33 @@ static int lod_qos_calc_rr(struct lod_device *lod, struct ost_pool *src_pool, deleting from the pool. The lq_rw_sem insures that nobody else is reading. 
*/ lqr->lqr_pool.op_count = real_count; - rc = lod_ost_pool_extend(&lqr->lqr_pool, real_count); + rc = tgt_pool_extend(&lqr->lqr_pool, real_count); if (rc) { - up_write(&lod->lod_qos.lq_rw_sem); + up_write(&ltd->ltd_qos.lq_rw_sem); RETURN(rc); } for (i = 0; i < lqr->lqr_pool.op_count; i++) lqr->lqr_pool.op_array[i] = LOV_QOS_EMPTY; - /* Place all the OSTs from 1 OSS at the same time. */ + /* Place all the tgts from 1 svr at the same time. */ placed = 0; - list_for_each_entry(oss, &lod->lod_qos.lq_svr_list, lsq_svr_list) { + list_for_each_entry(svr, &ltd->ltd_qos.lq_svr_list, lsq_svr_list) { int j = 0; for (i = 0; i < lqr->lqr_pool.op_count; i++) { int next; - if (!cfs_bitmap_check(lod->lod_ost_bitmap, - src_pool->op_array[i])) + if (!test_bit(src_pool->op_array[i], + ltd->ltd_tgt_bitmap)) continue; - ost = OST_TGT(lod,src_pool->op_array[i]); - LASSERT(ost && ost->ltd_ost); - if (ost->ltd_qos.ltq_svr != oss) + tgt = LTD_TGT(ltd, src_pool->op_array[i]); + LASSERT(tgt && tgt->ltd_tgt); + if (tgt->ltd_qos.ltq_svr != svr) continue; - /* Evenly space these OSTs across arrayspace */ - next = j * lqr->lqr_pool.op_count / oss->lsq_tgt_count; + /* Evenly space these tgts across arrayspace */ + next = j * lqr->lqr_pool.op_count / svr->lsq_tgt_count; while (lqr->lqr_pool.op_array[next] != LOV_QOS_EMPTY) next = (next + 1) % lqr->lqr_pool.op_count; @@ -529,19 +323,19 @@ static int lod_qos_calc_rr(struct lod_device *lod, struct ost_pool *src_pool, } } - lqr->lqr_dirty = 0; - up_write(&lod->lod_qos.lq_rw_sem); + clear_bit(LQ_DIRTY, &lqr->lqr_flags); + up_write(&ltd->ltd_qos.lq_rw_sem); if (placed != real_count) { /* This should never happen */ - LCONSOLE_ERROR_MSG(0x14e, "Failed to place all OSTs in the " + LCONSOLE_ERROR_MSG(0x14e, "Failed to place all tgts in the " "round-robin list (%d of %d).\n", placed, real_count); for (i = 0; i < lqr->lqr_pool.op_count; i++) { - LCONSOLE(D_WARNING, "rr #%d ost idx=%d\n", i, + LCONSOLE(D_WARNING, "rr #%d tgt idx=%d\n", i, 
lqr->lqr_pool.op_array[i]); } - lqr->lqr_dirty = 1; + set_bit(LQ_DIRTY, &lqr->lqr_flags); RETURN(-EAGAIN); } @@ -576,8 +370,10 @@ static int lod_qos_calc_rr(struct lod_device *lod, struct ost_pool *src_pool, static struct dt_object *lod_qos_declare_object_on(const struct lu_env *env, struct lod_device *d, __u32 ost_idx, + bool can_block, struct thandle *th) { + struct dt_allocation_hint *ah = &lod_env_info(env)->lti_ah; struct lod_tgt_desc *ost; struct lu_object *o, *n; struct lu_device *nd; @@ -586,12 +382,12 @@ static struct dt_object *lod_qos_declare_object_on(const struct lu_env *env, ENTRY; LASSERT(d); - LASSERT(ost_idx < d->lod_osts_size); + LASSERT(ost_idx < d->lod_ost_descs.ltd_tgts_size); ost = OST_TGT(d,ost_idx); LASSERT(ost); - LASSERT(ost->ltd_ost); + LASSERT(ost->ltd_tgt); - nd = &ost->ltd_ost->dd_lu_dev; + nd = &ost->ltd_tgt->dd_lu_dev; /* * allocate anonymous object with zero fid, real fid @@ -600,7 +396,7 @@ static struct dt_object *lod_qos_declare_object_on(const struct lu_env *env, */ o = lu_object_anon(env, nd, NULL); if (IS_ERR(o)) - GOTO(out, dt = ERR_PTR(PTR_ERR(o))); + GOTO(out, dt = ERR_CAST(o)); n = lu_object_locate(o->lo_header, nd->ld_type); if (unlikely(n == NULL)) { @@ -611,7 +407,8 @@ static struct dt_object *lod_qos_declare_object_on(const struct lu_env *env, dt = container_of(n, struct dt_object, do_lu); - rc = lod_sub_declare_create(env, dt, NULL, NULL, NULL, th); + ah->dah_can_block = can_block; + rc = lod_sub_declare_create(env, dt, NULL, ah, NULL, th); if (rc < 0) { CDEBUG(D_OTHER, "can't declare creation on #%u: %d\n", ost_idx, rc); @@ -644,7 +441,7 @@ static int min_stripe_count(__u32 stripe_count, int flags) #define LOV_CREATE_RESEED_MIN 2000 /** - * Initialize temporary OST-in-use array. + * Initialize temporary tgt-in-use array. * * Allocate or extend the array used to mark targets already assigned to a new * striping so they are not used more than once. 
@@ -655,7 +452,7 @@ static int min_stripe_count(__u32 stripe_count, int flags) * \retval 0 on success * \retval -ENOMEM on error */ -static inline int lod_qos_ost_in_use_clear(const struct lu_env *env, +static inline int lod_qos_tgt_in_use_clear(const struct lu_env *env, __u32 stripes) { struct lod_thread_info *info = lod_env_info(env); @@ -663,7 +460,7 @@ static inline int lod_qos_ost_in_use_clear(const struct lu_env *env, if (info->lti_ea_store_size < sizeof(int) * stripes) lod_ea_store_resize(info, stripes * sizeof(int)); if (info->lti_ea_store_size < sizeof(int) * stripes) { - CERROR("can't allocate memory for ost-in-use array\n"); + CERROR("can't allocate memory for tgt-in-use array\n"); return -ENOMEM; } memset(info->lti_ea_store, -1, sizeof(int) * stripes); @@ -674,43 +471,44 @@ static inline int lod_qos_ost_in_use_clear(const struct lu_env *env, * Remember a target in the array of used targets. * * Mark the given target as used for a new striping being created. The status - * of an OST in a striping can be checked with lod_qos_is_ost_used(). + * of an tgt in a striping can be checked with lod_qos_is_tgt_used(). * * \param[in] env execution environment for this thread * \param[in] idx index in the array - * \param[in] ost OST target index to mark as used + * \param[in] tgt_idx target index to mark as used */ -static inline void lod_qos_ost_in_use(const struct lu_env *env, - int idx, int ost) +static inline void lod_qos_tgt_in_use(const struct lu_env *env, + int idx, int tgt_idx) { struct lod_thread_info *info = lod_env_info(env); - int *osts = info->lti_ea_store; + int *tgts = info->lti_ea_store; LASSERT(info->lti_ea_store_size >= idx * sizeof(int)); - osts[idx] = ost; + tgts[idx] = tgt_idx; } /** - * Check is OST used in a striping. + * Check is tgt used in a striping. * - * Checks whether OST with the given index is marked as used in the temporary - * array (see lod_qos_ost_in_use()). 
+ * Checks whether tgt with the given index is marked as used in the temporary + * array (see lod_qos_tgt_in_use()). * * \param[in] env execution environment for this thread - * \param[in] ost OST target index to check + * \param[in] tgt_idx target index to check * \param[in] stripes the number of items used in the array already * * \retval 0 not used * \retval 1 used */ -static int lod_qos_is_ost_used(const struct lu_env *env, int ost, __u32 stripes) +static int lod_qos_is_tgt_used(const struct lu_env *env, int tgt_idx, + __u32 stripes) { struct lod_thread_info *info = lod_env_info(env); - int *osts = info->lti_ea_store; + int *tgts = info->lti_ea_store; __u32 j; for (j = 0; j < stripes; j++) { - if (osts[j] == ost) + if (tgts[j] == tgt_idx) return 1; } return 0; @@ -784,7 +582,7 @@ static inline bool lod_should_avoid_ost(struct lod_object *lo, bool used = false; int i; - if (!cfs_bitmap_check(lod->lod_ost_bitmap, index)) { + if (!test_bit(index, lod->lod_ost_bitmap)) { QOS_DEBUG("OST%d: been used in conflicting mirror component\n", index); return true; @@ -812,7 +610,7 @@ static inline bool lod_should_avoid_ost(struct lod_object *lo, return false; /* if the OSS has been used, check whether the OST has been used */ - if (!cfs_bitmap_check(lag->lag_ost_avoid_bitmap, index)) + if (!test_bit(index, lag->lag_ost_avoid_bitmap)) used = false; else QOS_DEBUG("OST%d: been used in conflicting mirror component\n", @@ -823,21 +621,23 @@ static inline bool lod_should_avoid_ost(struct lod_object *lo, static int lod_check_and_reserve_ost(const struct lu_env *env, struct lod_object *lo, struct lod_layout_component *lod_comp, - struct obd_statfs *sfs, __u32 ost_idx, - __u32 speed, __u32 *s_idx, + __u32 ost_idx, __u32 speed, __u32 *s_idx, struct dt_object **stripe, __u32 *ost_indices, struct thandle *th, - bool *overstriped) + bool *overstriped, + __u64 reserve) { struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev); struct lod_avoid_guide *lag = 
&lod_env_info(env)->lti_avoid; + struct lu_tgt_desc *ost = OST_TGT(lod, ost_idx); struct dt_object *o; __u32 stripe_idx = *s_idx; int rc; + ENTRY; - rc = lod_statfs_and_check(env, lod, ost_idx, sfs); + rc = lod_statfs_and_check(env, lod, &lod->lod_ost_descs, ost, reserve); if (rc) RETURN(rc); @@ -845,7 +645,7 @@ static int lod_check_and_reserve_ost(const struct lu_env *env, * We expect number of precreated objects in f_ffree at * the first iteration, skip OSPs with no objects ready */ - if (sfs->os_fprecreated == 0 && speed == 0) { + if (ost->ltd_statfs.os_fprecreated == 0 && speed == 0) { QOS_DEBUG("#%d: precreation is empty\n", ost_idx); RETURN(rc); } @@ -853,7 +653,7 @@ static int lod_check_and_reserve_ost(const struct lu_env *env, /* * try to use another OSP if this one is degraded */ - if (sfs->os_state & OS_STATE_DEGRADED && speed < 2) { + if (ost->ltd_statfs.os_state & OS_STATFS_DEGRADED && speed < 2) { QOS_DEBUG("#%d: degraded\n", ost_idx); RETURN(rc); } @@ -873,20 +673,20 @@ static int lod_check_and_reserve_ost(const struct lu_env *env, * for the first and second time. 
*/ if (speed < 2 && lod_should_avoid_ost(lo, lag, ost_idx)) { - QOS_DEBUG("iter %d: OST%d used by conflicting mirror " - "component\n", speed, ost_idx); + QOS_DEBUG("iter %d: OST%d used by conflicting mirror component\n", + speed, ost_idx); RETURN(rc); } /* do not put >1 objects on a single OST, except for overstriping */ - if (lod_qos_is_ost_used(env, ost_idx, stripe_idx)) { + if (lod_qos_is_tgt_used(env, ost_idx, stripe_idx)) { if (lod_comp->llc_pattern & LOV_PATTERN_OVERSTRIPING) *overstriped = true; else RETURN(rc); } - o = lod_qos_declare_object_on(env, lod, ost_idx, th); + o = lod_qos_declare_object_on(env, lod, ost_idx, true, th); if (IS_ERR(o)) { CDEBUG(D_OTHER, "can't declare new object on #%u: %d\n", ost_idx, (int) PTR_ERR(o)); @@ -898,7 +698,7 @@ static int lod_check_and_reserve_ost(const struct lu_env *env, * We've successfully declared (reserved) an object */ lod_avoid_update(lo, lag); - lod_qos_ost_in_use(env, stripe_idx, ost_idx); + lod_qos_tgt_in_use(env, stripe_idx, ost_idx); stripe[stripe_idx] = o; ost_indices[stripe_idx] = ost_idx; OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_LOV_CREATE_RACE, 2); @@ -934,17 +734,17 @@ static int lod_check_and_reserve_ost(const struct lu_env *env, * \retval -ENOSPC if not enough OSTs are found * \retval negative negated errno for other failures */ -static int lod_alloc_rr(const struct lu_env *env, struct lod_object *lo, - struct dt_object **stripe, __u32 *ost_indices, - int flags, struct thandle *th, int comp_idx) +static int lod_ost_alloc_rr(const struct lu_env *env, struct lod_object *lo, + struct dt_object **stripe, __u32 *ost_indices, + int flags, struct thandle *th, int comp_idx, + __u64 reserve) { struct lod_layout_component *lod_comp; struct lod_device *m = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev); - struct obd_statfs *sfs = &lod_env_info(env)->lti_osfs; struct pool_desc *pool = NULL; - struct ost_pool *osts; + struct lu_tgt_pool *osts; struct lu_qos_rr *lqr; - unsigned int i, array_idx; + unsigned int i, array_idx; __u32 
ost_start_idx_temp; __u32 stripe_idx = 0; __u32 stripe_count, stripe_count_min, ost_idx; @@ -966,19 +766,19 @@ static int lod_alloc_rr(const struct lu_env *env, struct lod_object *lo, osts = &(pool->pool_obds); lqr = &(pool->pool_rr); } else { - osts = &(m->lod_pool_info); - lqr = &(m->lod_qos.lq_rr); + osts = &m->lod_ost_descs.ltd_tgt_pool; + lqr = &(m->lod_ost_descs.ltd_qos.lq_rr); } - rc = lod_qos_calc_rr(m, osts, lqr); + rc = lod_qos_calc_rr(m, &m->lod_ost_descs, osts, lqr); if (rc) GOTO(out, rc); - rc = lod_qos_ost_in_use_clear(env, stripe_count); + rc = lod_qos_tgt_in_use_clear(env, stripe_count); if (rc) GOTO(out, rc); - down_read(&m->lod_qos.lq_rw_sem); + down_read(&m->lod_ost_descs.ltd_qos.lq_rw_sem); spin_lock(&lqr->lqr_alloc); if (--lqr->lqr_start_count <= 0) { lqr->lqr_start_idx = prandom_u32_max(osts->op_count); @@ -1020,7 +820,7 @@ repeat_find: stripe_idx, array_idx, ost_idx); if ((ost_idx == LOV_QOS_EMPTY) || - !cfs_bitmap_check(m->lod_ost_bitmap, ost_idx)) + !test_bit(ost_idx, m->lod_ost_bitmap)) continue; /* Fail Check before osc_precreate() is called @@ -1029,9 +829,10 @@ repeat_find: continue; spin_unlock(&lqr->lqr_alloc); - rc = lod_check_and_reserve_ost(env, lo, lod_comp, sfs, ost_idx, + rc = lod_check_and_reserve_ost(env, lo, lod_comp, ost_idx, speed, &stripe_idx, stripe, - ost_indices, th, &overstriped); + ost_indices, th, &overstriped, + reserve); spin_lock(&lqr->lqr_alloc); if (rc != 0 && OST_TGT(m, ost_idx)->ltd_connecting) @@ -1047,7 +848,7 @@ repeat_find: } spin_unlock(&lqr->lqr_alloc); - up_read(&m->lod_qos.lq_rw_sem); + up_read(&m->lod_ost_descs.ltd_qos.lq_rw_sem); /* If there are enough OSTs, a component with overstriping requested * will not actually end up overstriped. The comp should reflect this. 
@@ -1077,6 +878,203 @@ out: RETURN(rc); } +static int +lod_qos_mdt_in_use_init(const struct lu_env *env, + const struct lu_tgt_descs *ltd, + u32 stripe_idx, u32 stripe_count, + const struct lu_tgt_pool *pool, + struct dt_object **stripes) +{ + u32 mdt_idx; + struct lu_tgt_desc *mdt; + int i, j; + int rc; + + rc = lod_qos_tgt_in_use_clear(env, stripe_count); + if (rc) + return rc; + + /* if stripe_idx > 1, we are splitting directory, mark existing stripes + * in_use. Because for either split or creation, stripe 0 is local, + * don't mark it in use. + */ + for (i = 1; i < stripe_idx; i++) { + LASSERT(stripes[i]); + for (j = 0; j < pool->op_count; j++) { + mdt_idx = pool->op_array[j]; + + if (!test_bit(mdt_idx, ltd->ltd_tgt_bitmap)) + continue; + + mdt = LTD_TGT(ltd, mdt_idx); + if (&mdt->ltd_tgt->dd_lu_dev == + stripes[i]->do_lu.lo_dev) + lod_qos_tgt_in_use(env, i, mdt_idx); + } + } + + return 0; +} + +/** + * Allocate a striping using round-robin algorithm. + * + * Allocates a new striping using round-robin algorithm. The function refreshes + * all the internal structures (statfs cache, array of available remote MDTs + * sorted with regard to MDS, etc). The number of stripes required is taken from + * the object (must be prepared by the caller). The caller should ensure nobody + * else is trying to create a striping on the object in parallel. All the + * internal structures (like pools, etc) are protected and no additional locking + * is required. The function succeeds even if a single stripe is allocated. 
+ * + * \param[in] env execution environment for this thread + * \param[in] lo LOD object + * \param[out] stripes striping created + * + * \retval positive stripe objects allocated, including the first stripe + * allocated outside + * \retval -ENOSPC if not enough MDTs are found + * \retval negative negated errno for other failures + */ +int lod_mdt_alloc_rr(const struct lu_env *env, struct lod_object *lo, + struct dt_object **stripes, u32 stripe_idx, + u32 stripe_count) +{ + struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev); + struct lu_tgt_descs *ltd = &lod->lod_mdt_descs; + struct lu_tgt_pool *pool; + struct lu_qos_rr *lqr; + struct lu_tgt_desc *mdt; + struct lu_object_conf conf = { .loc_flags = LOC_F_NEW }; + struct lu_fid fid = { 0 }; + struct dt_object *dto; + unsigned int pool_idx; + unsigned int i; + u32 saved_idx = stripe_idx; + u32 start_mdt; + u32 mdt_idx; + bool use_degraded = false; + int tgt_connecting = 0; + int rc; + + ENTRY; + + pool = &ltd->ltd_tgt_pool; + lqr = &ltd->ltd_qos.lq_rr; + rc = lod_qos_calc_rr(lod, ltd, pool, lqr); + if (rc) + RETURN(rc); + + rc = lod_qos_mdt_in_use_init(env, ltd, stripe_idx, stripe_count, pool, + stripes); + if (rc) + RETURN(rc); + + down_read(&ltd->ltd_qos.lq_rw_sem); + spin_lock(&lqr->lqr_alloc); + if (--lqr->lqr_start_count <= 0) { + lqr->lqr_start_idx = prandom_u32_max(pool->op_count); + lqr->lqr_start_count = + (LOV_CREATE_RESEED_MIN / max(pool->op_count, 1U) + + LOV_CREATE_RESEED_MULT) * max(pool->op_count, 1U); + } else if (stripe_count - 1 >= pool->op_count || + lqr->lqr_start_idx > pool->op_count) { + /* If we have allocated from all of the tgts, slowly + * precess the next start if the tgt/stripe count isn't + * already doing this for us.
*/ + lqr->lqr_start_idx %= pool->op_count; + if (stripe_count - 1 > 1 && + (pool->op_count % (stripe_count - 1)) != 1) + ++lqr->lqr_offset_idx; + } + start_mdt = lqr->lqr_start_idx; + +repeat_find: + QOS_DEBUG("want=%d start_idx=%d start_count=%d offset=%d active=%d count=%d\n", + stripe_count - 1, lqr->lqr_start_idx, lqr->lqr_start_count, + lqr->lqr_offset_idx, pool->op_count, pool->op_count); + + for (i = 0; i < pool->op_count && stripe_idx < stripe_count; i++) { + pool_idx = (lqr->lqr_start_idx + lqr->lqr_offset_idx) % + pool->op_count; + ++lqr->lqr_start_idx; + mdt_idx = lqr->lqr_pool.op_array[pool_idx]; + mdt = LTD_TGT(ltd, mdt_idx); + + QOS_DEBUG("#%d strt %d act %d strp %d ary %d idx %d\n", + i, lqr->lqr_start_idx, /* XXX: active*/ 0, + stripe_idx, pool_idx, mdt_idx); + + if (mdt_idx == LOV_QOS_EMPTY || + !test_bit(mdt_idx, ltd->ltd_tgt_bitmap)) + continue; + + /* do not put >1 objects on one MDT */ + if (lod_qos_is_tgt_used(env, mdt_idx, stripe_idx)) + continue; + + rc = lod_is_tgt_usable(ltd, mdt); + if (rc) { + if (mdt->ltd_connecting) + tgt_connecting = 1; + continue; + } + + /* try to use another OSP if this one is degraded */ + if (mdt->ltd_statfs.os_state & OS_STATFS_DEGRADED && + !use_degraded) { + QOS_DEBUG("#%d: degraded\n", mdt_idx); + continue; + } + spin_unlock(&lqr->lqr_alloc); + + rc = dt_fid_alloc(env, mdt->ltd_tgt, &fid, NULL, NULL); + if (rc < 0) { + QOS_DEBUG("#%d: alloc FID failed: %dl\n", mdt_idx, rc); + spin_lock(&lqr->lqr_alloc); + continue; + } + + dto = dt_locate_at(env, mdt->ltd_tgt, &fid, + lo->ldo_obj.do_lu.lo_dev->ld_site->ls_top_dev, + &conf); + + spin_lock(&lqr->lqr_alloc); + if (IS_ERR(dto)) { + QOS_DEBUG("can't alloc stripe on #%u: %d\n", + mdt->ltd_index, (int) PTR_ERR(dto)); + + if (mdt->ltd_connecting) + tgt_connecting = 1; + continue; + } + + lod_qos_tgt_in_use(env, stripe_idx, mdt_idx); + stripes[stripe_idx++] = dto; + } + + if (!use_degraded && stripe_idx < stripe_count) { + /* Try again, allowing slower MDTs */ + 
use_degraded = true; + lqr->lqr_start_idx = start_mdt; + + tgt_connecting = 0; + goto repeat_find; + } + spin_unlock(&lqr->lqr_alloc); + up_read(&ltd->ltd_qos.lq_rw_sem); + + if (stripe_idx > saved_idx) + /* at least one stripe is allocated */ + RETURN(stripe_idx); + + /* nobody provided us with a single object */ + if (tgt_connecting) + RETURN(-EINPROGRESS); + + RETURN(-ENOSPC); +} + /** * Allocate a specific striping layout on a user defined set of OSTs. * @@ -1104,11 +1102,10 @@ out: */ static int lod_alloc_ost_list(const struct lu_env *env, struct lod_object *lo, struct dt_object **stripe, __u32 *ost_indices, - struct thandle *th, int comp_idx) + struct thandle *th, int comp_idx, __u64 reserve) { struct lod_layout_component *lod_comp; struct lod_device *m = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev); - struct obd_statfs *sfs = &lod_env_info(env)->lti_osfs; struct dt_object *o; unsigned int array_idx = 0; int stripe_count = 0; @@ -1122,7 +1119,7 @@ static int lod_alloc_ost_list(const struct lu_env *env, struct lod_object *lo, LASSERT(lod_comp->llc_ostlist.op_array); LASSERT(lod_comp->llc_ostlist.op_count); - rc = lod_qos_ost_in_use_clear(env, lod_comp->llc_stripe_count); + rc = lod_qos_tgt_in_use_clear(env, lod_comp->llc_stripe_count); if (rc < 0) RETURN(rc); @@ -1148,7 +1145,7 @@ static int lod_alloc_ost_list(const struct lu_env *env, struct lod_object *lo, i++, array_idx = (array_idx + 1) % lod_comp->llc_stripe_count) { __u32 ost_idx = lod_comp->llc_ostlist.op_array[array_idx]; - if (!cfs_bitmap_check(m->lod_ost_bitmap, ost_idx)) { + if (!test_bit(ost_idx, m->lod_ost_bitmap)) { rc = -ENODEV; break; } @@ -1156,17 +1153,19 @@ static int lod_alloc_ost_list(const struct lu_env *env, struct lod_object *lo, /* do not put >1 objects on a single OST, except for * overstriping */ - if (lod_qos_is_ost_used(env, ost_idx, stripe_count) && + if (lod_qos_is_tgt_used(env, ost_idx, stripe_count) && !(lod_comp->llc_pattern & LOV_PATTERN_OVERSTRIPING)) { rc = -EINVAL; break; } - rc =
lod_statfs_and_check(env, m, ost_idx, sfs); + rc = lod_statfs_and_check(env, m, &m->lod_ost_descs, + LTD_TGT(&m->lod_ost_descs, ost_idx), + reserve); if (rc < 0) /* this OSP doesn't feel well */ break; - o = lod_qos_declare_object_on(env, m, ost_idx, th); + o = lod_qos_declare_object_on(env, m, ost_idx, true, th); if (IS_ERR(o)) { rc = PTR_ERR(o); CDEBUG(D_OTHER, @@ -1178,7 +1177,7 @@ static int lod_alloc_ost_list(const struct lu_env *env, struct lod_object *lo, /* * We've successfully declared (reserved) an object */ - lod_qos_ost_in_use(env, stripe_count, ost_idx); + lod_qos_tgt_in_use(env, stripe_count, ost_idx); stripe[stripe_count] = o; ost_indices[stripe_count] = ost_idx; stripe_count++; @@ -1213,20 +1212,22 @@ static int lod_alloc_ost_list(const struct lu_env *env, struct lod_object *lo, * \retval -EINVAL requested offset is invalid * \retval negative errno on failure */ -static int lod_alloc_specific(const struct lu_env *env, struct lod_object *lo, - struct dt_object **stripe, __u32 *ost_indices, - int flags, struct thandle *th, int comp_idx) +static int lod_ost_alloc_specific(const struct lu_env *env, + struct lod_object *lo, + struct dt_object **stripe, __u32 *ost_indices, + int flags, struct thandle *th, int comp_idx, + __u64 reserve) { struct lod_layout_component *lod_comp; struct lod_device *m = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev); - struct obd_statfs *sfs = &lod_env_info(env)->lti_osfs; struct dt_object *o; + struct lu_tgt_desc *tgt; __u32 ost_idx; unsigned int i, array_idx, ost_count; int rc, stripe_num = 0; int speed = 0; - struct pool_desc *pool = NULL; - struct ost_pool *osts; + struct pool_desc *pool = NULL; + struct lu_tgt_pool *osts; int stripes_per_ost = 1; bool overstriped = false; ENTRY; @@ -1234,7 +1235,7 @@ static int lod_alloc_specific(const struct lu_env *env, struct lod_object *lo, LASSERT(lo->ldo_comp_cnt > comp_idx && lo->ldo_comp_entries != NULL); lod_comp = &lo->ldo_comp_entries[comp_idx]; - rc = lod_qos_ost_in_use_clear(env, 
lod_comp->llc_stripe_count); + rc = lod_qos_tgt_in_use_clear(env, lod_comp->llc_stripe_count); if (rc) GOTO(out, rc); @@ -1245,7 +1246,7 @@ static int lod_alloc_specific(const struct lu_env *env, struct lod_object *lo, down_read(&pool_tgt_rw_sem(pool)); osts = &(pool->pool_obds); } else { - osts = &(m->lod_pool_info); + osts = &m->lod_ost_descs.ltd_tgt_pool; } ost_count = osts->op_count; @@ -1274,7 +1275,7 @@ repeat_find: i++, array_idx = (array_idx + 1) % ost_count) { ost_idx = osts->op_array[array_idx]; - if (!cfs_bitmap_check(m->lod_ost_bitmap, ost_idx)) + if (!test_bit(ost_idx, m->lod_ost_bitmap)) continue; /* Fail Check before osc_precreate() is called @@ -1286,7 +1287,7 @@ repeat_find: * do not put >1 objects on a single OST, except for * overstriping, where it is intended */ - if (lod_qos_is_ost_used(env, ost_idx, stripe_num)) { + if (lod_qos_is_tgt_used(env, ost_idx, stripe_num)) { if (lod_comp->llc_pattern & LOV_PATTERN_OVERSTRIPING) overstriped = true; else @@ -1300,13 +1301,16 @@ repeat_find: lod_comp_is_ost_used(env, lo, ost_idx)) continue; + tgt = LTD_TGT(&m->lod_ost_descs, ost_idx); + /* Drop slow OSCs if we can, but not for requested start idx. * * This means "if OSC is slow and it is not the requested * start OST, then it can be skipped, otherwise skip it only * if it is inactive/recovering/out-of-space." */ - rc = lod_statfs_and_check(env, m, ost_idx, sfs); + rc = lod_statfs_and_check(env, m, &m->lod_ost_descs, + tgt, reserve); if (rc) { /* this OSP doesn't feel well */ continue; @@ -1317,10 +1321,10 @@ repeat_find: * iteration. Skip OSPs with no objects ready. Don't apply * this logic to OST specified with stripe_offset. 
*/ - if (i != 0 && sfs->os_fprecreated == 0 && speed == 0) + if (i && !tgt->ltd_statfs.os_fprecreated && !speed) continue; - o = lod_qos_declare_object_on(env, m, ost_idx, th); + o = lod_qos_declare_object_on(env, m, ost_idx, true, th); if (IS_ERR(o)) { CDEBUG(D_OTHER, "can't declare new object on #%u: %d\n", ost_idx, (int) PTR_ERR(o)); @@ -1330,7 +1334,7 @@ repeat_find: /* * We've successfully declared (reserved) an object */ - lod_qos_ost_in_use(env, stripe_num, ost_idx); + lod_qos_tgt_in_use(env, stripe_num, ost_idx); stripe[stripe_num] = o; ost_indices[stripe_num] = ost_idx; stripe_num++; @@ -1371,36 +1375,6 @@ out: } /** - * Check whether QoS allocation should be used. - * - * A simple helper to decide when QoS allocation should be used: - * if it's just a single available target or the used space is - * evenly distributed among the targets at the moment, then QoS - * allocation algorithm should not be used. - * - * \param[in] lod LOD device - * - * \retval 0 should not be used - * \retval 1 should be used - */ -static inline int lod_qos_is_usable(struct lod_device *lod) -{ -#ifdef FORCE_QOS - /* to be able to debug QoS code */ - return 1; -#endif - - /* Detect -EAGAIN early, before expensive lock is taken. */ - if (!lod->lod_qos.lq_dirty && lod->lod_qos.lq_same_space) - return 0; - - if (lod->lod_desc.ld_active_tgt_count < 2) - return 0; - - return 1; -} - -/** * Allocate a striping using an algorithm with weights. * * The function allocates OST objects to create a striping. 
The algorithm @@ -1435,23 +1409,24 @@ static inline int lod_qos_is_usable(struct lod_device *lod) * \retval -EINVAL requested OST index is invalid * \retval negative errno on failure */ -static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo, - struct dt_object **stripe, __u32 *ost_indices, - int flags, struct thandle *th, int comp_idx) +static int lod_ost_alloc_qos(const struct lu_env *env, struct lod_object *lo, + struct dt_object **stripe, __u32 *ost_indices, + int flags, struct thandle *th, int comp_idx, + __u64 reserve) { struct lod_layout_component *lod_comp; struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev); - struct obd_statfs *sfs = &lod_env_info(env)->lti_osfs; struct lod_avoid_guide *lag = &lod_env_info(env)->lti_avoid; struct lod_tgt_desc *ost; struct dt_object *o; __u64 total_weight = 0; struct pool_desc *pool = NULL; - struct ost_pool *osts; + struct lu_tgt_pool *osts; unsigned int i; __u32 nfound, good_osts, stripe_count, stripe_count_min; bool overstriped = false; int stripes_per_ost = 1; + bool slow = false; int rc = 0; ENTRY; @@ -1469,11 +1444,11 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo, down_read(&pool_tgt_rw_sem(pool)); osts = &(pool->pool_obds); } else { - osts = &(lod->lod_pool_info); + osts = &lod->lod_ost_descs.ltd_tgt_pool; } /* Detect -EAGAIN early, before expensive lock is taken. */ - if (!lod_qos_is_usable(lod)) + if (!ltd_qos_is_usable(&lod->lod_ost_descs)) GOTO(out_nolock, rc = -EAGAIN); if (lod_comp->llc_pattern & LOV_PATTERN_OVERSTRIPING) @@ -1481,49 +1456,51 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo, (lod_comp->llc_stripe_count - 1)/osts->op_count + 1; /* Do actual allocation, use write lock here. */ - down_write(&lod->lod_qos.lq_rw_sem); + down_write(&lod->lod_ost_descs.ltd_qos.lq_rw_sem); /* * Check again, while we were sleeping on @lq_rw_sem things could * change. 
*/ - if (!lod_qos_is_usable(lod)) + if (!ltd_qos_is_usable(&lod->lod_ost_descs)) GOTO(out, rc = -EAGAIN); - rc = lod_qos_calc_ppo(lod); + rc = ltd_qos_penalties_calc(&lod->lod_ost_descs); if (rc) GOTO(out, rc); - rc = lod_qos_ost_in_use_clear(env, lod_comp->llc_stripe_count); + rc = lod_qos_tgt_in_use_clear(env, lod_comp->llc_stripe_count); if (rc) GOTO(out, rc); good_osts = 0; /* Find all the OSTs that are valid stripe candidates */ for (i = 0; i < osts->op_count; i++) { - if (!cfs_bitmap_check(lod->lod_ost_bitmap, osts->op_array[i])) + if (!test_bit(osts->op_array[i], lod->lod_ost_bitmap)) continue; ost = OST_TGT(lod, osts->op_array[i]); ost->ltd_qos.ltq_usable = 0; - rc = lod_statfs_and_check(env, lod, osts->op_array[i], sfs); + rc = lod_statfs_and_check(env, lod, &lod->lod_ost_descs, + ost, reserve); if (rc) { /* this OSP doesn't feel well */ continue; } - if (sfs->os_state & OS_STATE_DEGRADED) + if (ost->ltd_statfs.os_state & OS_STATFS_DEGRADED) continue; /* Fail Check before osc_precreate() is called - so we can only 'fail' single OSC. */ + * so we can only 'fail' single OSC. + */ if (OBD_FAIL_CHECK(OBD_FAIL_MDS_OSC_PRECREATE) && osts->op_array[i] == 0) continue; ost->ltd_qos.ltq_usable = 1; - lod_qos_calc_weight(lod, osts->op_array[i]); + lu_tgt_qos_weight_calc(ost); total_weight += ost->ltd_qos.ltq_weight; good_osts++; @@ -1551,9 +1528,11 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo, rand = lu_prandom_u64_max(total_weight); /* On average, this will hit larger-weighted OSTs more often. 
- * 0-weight OSTs will always get used last (only when rand=0) */ + * 0-weight OSTs will always get used last (only when rand=0) + */ for (i = 0; i < osts->op_count; i++) { __u32 idx = osts->op_array[i]; + struct lod_tgt_desc *ost = OST_TGT(lod, idx); if (lod_should_avoid_ost(lo, lag, idx)) continue; @@ -1581,7 +1560,7 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo, !(lod_comp->llc_pattern & LOV_PATTERN_OVERSTRIPING)) continue; - if (lod_qos_is_ost_used(env, idx, nfound)) { + if (lod_qos_is_tgt_used(env, idx, nfound)) { if (lod_comp->llc_pattern & LOV_PATTERN_OVERSTRIPING) overstriped = true; @@ -1589,7 +1568,7 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo, continue; } - o = lod_qos_declare_object_on(env, lod, idx, th); + o = lod_qos_declare_object_on(env, lod, idx, slow, th); if (IS_ERR(o)) { QOS_DEBUG("can't declare object on #%u: %d\n", idx, (int) PTR_ERR(o)); @@ -1597,15 +1576,22 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo, } lod_avoid_update(lo, lag); - lod_qos_ost_in_use(env, nfound, idx); + lod_qos_tgt_in_use(env, nfound, idx); stripe[nfound] = o; ost_indices[nfound] = idx; - lod_qos_used(lod, osts, idx, &total_weight); + ltd_qos_update(&lod->lod_ost_descs, ost, &total_weight); nfound++; rc = 0; break; } + if (rc && !slow && nfound < stripe_count) { + /* couldn't allocate using precreated objects + * so try to wait for new precreations */ + slow = true; + rc = 0; + } + if (rc) { /* no OST found on this iteration, give up */ break; @@ -1629,9 +1615,8 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo, } /* makes sense to rebalance next time */ - lod->lod_qos.lq_dirty = 1; - lod->lod_qos.lq_same_space = 0; - + set_bit(LQ_DIRTY, &lod->lod_ost_descs.ltd_qos.lq_flags); + clear_bit(LQ_SAME_SPACE, &lod->lod_ost_descs.ltd_qos.lq_flags); rc = -EAGAIN; } @@ -1642,7 +1627,7 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo, 
lod_comp->llc_pattern &= ~LOV_PATTERN_OVERSTRIPING; out: - up_write(&lod->lod_qos.lq_rw_sem); + up_write(&lod->lod_ost_descs.ltd_qos.lq_rw_sem); out_nolock: if (pool != NULL) { @@ -1655,6 +1640,216 @@ out_nolock: } /** + * Allocate a striping using an algorithm with weights. + * + * The function allocates remote MDT objects to create a striping, the first + * object was already allocated on current MDT to ensure master object and + * the first object are on the same MDT. The algorithm used is based on weights + * (both free space and inodes), and it's trying to ensure the space/inodes are + * used evenly by MDTs and MDSs. The striping configuration (# of stripes, + * offset, pool) is taken from the object and is prepared by the caller. + * + * If prepared configuration can't be met due to too few MDTs, then allocation + * fails. + * + * No concurrent allocation is allowed on the object and this must be ensured + * by the caller. All the internal structures are protected by the function. + * + * The algorithm has two steps: find available MDTs and calculate their + * weights, then select the MDTs with their weights used as the probability. + * An MDT with a higher weight is proportionately more likely to be selected + * than one with a lower weight. 
+ * + * \param[in] env execution environment for this thread + * \param[in] lo LOD object + * \param[in] stripe_idx starting stripe index to allocate, if it's not + * 0, we are restriping directory + * \param[in] stripe_count total stripe count + * \param[out] stripes striping created + * + * \retval positive stripes allocated, and it should be equal to + * lo->ldo_dir_stripe_count + * \retval -EAGAIN not enough tgts are found for specified stripe count + * \retval -EINVAL requested MDT index is invalid + * \retval negative errno on failure + */ +int lod_mdt_alloc_qos(const struct lu_env *env, struct lod_object *lo, + struct dt_object **stripes, u32 stripe_idx, + u32 stripe_count) +{ + struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev); + struct lu_tgt_descs *ltd = &lod->lod_mdt_descs; + struct lu_object_conf conf = { .loc_flags = LOC_F_NEW }; + struct lu_fid fid = { 0 }; + const struct lu_tgt_pool *pool; + struct lu_tgt_desc *mdt; + struct dt_object *dto; + u64 total_weight = 0; + u32 saved_idx = stripe_idx; + u32 mdt_idx; + unsigned int good_mdts; + unsigned int i; + int rc = 0; + + ENTRY; + + LASSERT(stripe_idx <= stripe_count); + if (stripe_idx == stripe_count) + RETURN(stripe_count); + + /* use MDT pool in @ltd, once MDT pool is supported in the future, it + * can be passed in as argument like OST object allocation. + */ + pool = &ltd->ltd_tgt_pool; + + /* Detect -EAGAIN early, before expensive lock is taken. */ + if (!ltd_qos_is_usable(ltd)) + RETURN(-EAGAIN); + + rc = lod_qos_mdt_in_use_init(env, ltd, stripe_idx, stripe_count, pool, + stripes); + if (rc) + RETURN(rc); + + /* Do actual allocation, use write lock here. */ + down_write(&ltd->ltd_qos.lq_rw_sem); + + /* + * Check again, while we were sleeping on @lq_rw_sem things could + * change.
+ */ + if (!ltd_qos_is_usable(ltd)) + GOTO(unlock, rc = -EAGAIN); + + rc = ltd_qos_penalties_calc(ltd); + if (rc) + GOTO(unlock, rc); + + good_mdts = 0; + /* Find all the MDTs that are valid stripe candidates */ + for (i = 0; i < pool->op_count; i++) { + if (!test_bit(pool->op_array[i], ltd->ltd_tgt_bitmap)) + continue; + + mdt = LTD_TGT(ltd, pool->op_array[i]); + mdt->ltd_qos.ltq_usable = 0; + + rc = lod_is_tgt_usable(ltd, mdt); + if (rc) + continue; + + if (mdt->ltd_statfs.os_state & OS_STATFS_DEGRADED) + continue; + + mdt->ltd_qos.ltq_usable = 1; + lu_tgt_qos_weight_calc(mdt); + total_weight += mdt->ltd_qos.ltq_weight; + + good_mdts++; + } + + QOS_DEBUG("found %d good MDTs\n", good_mdts); + + if (good_mdts < stripe_count - stripe_idx) + GOTO(unlock, rc = -EAGAIN); + + /* Find enough MDTs with weighted random allocation. */ + while (stripe_idx < stripe_count) { + u64 rand, cur_weight; + + cur_weight = 0; + rc = -ENOSPC; + + rand = lu_prandom_u64_max(total_weight); + + /* On average, this will hit larger-weighted MDTs more often. 
+ * 0-weight MDT will always get used last (only when rand=0) */ + for (i = 0; i < pool->op_count; i++) { + int rc2; + + mdt_idx = pool->op_array[i]; + mdt = LTD_TGT(ltd, mdt_idx); + + if (!mdt->ltd_qos.ltq_usable) + continue; + + cur_weight += mdt->ltd_qos.ltq_weight; + + QOS_DEBUG("stripe_count=%d stripe_index=%d cur_weight=%llu rand=%llu total_weight=%llu\n", + stripe_count, stripe_idx, cur_weight, rand, + total_weight); + + if (cur_weight < rand) + continue; + + QOS_DEBUG("stripe=%d to idx=%d\n", + stripe_idx, mdt_idx); + + if (lod_qos_is_tgt_used(env, mdt_idx, stripe_idx)) + continue; + + rc2 = dt_fid_alloc(env, mdt->ltd_tgt, &fid, NULL, NULL); + if (rc2 < 0) { + QOS_DEBUG("can't alloc FID on #%u: %d\n", + mdt_idx, rc2); + continue; + } + + conf.loc_flags = LOC_F_NEW; + dto = dt_locate_at(env, mdt->ltd_tgt, &fid, + lo->ldo_obj.do_lu.lo_dev->ld_site->ls_top_dev, + &conf); + if (IS_ERR(dto)) { + QOS_DEBUG("can't alloc stripe on #%u: %d\n", + mdt_idx, (int) PTR_ERR(dto)); + continue; + } + + lod_qos_tgt_in_use(env, stripe_idx, mdt_idx); + stripes[stripe_idx] = dto; + ltd_qos_update(ltd, mdt, &total_weight); + stripe_idx++; + rc = 0; + break; + } + + /* no MDT found on this iteration, give up */ + if (rc) + break; + } + + if (unlikely(stripe_idx != stripe_count)) { + /* + * when the decision to use weighted algorithm was made + * we had enough appropriate OSPs, but this state can + * change anytime (no space on MDT, broken connection, etc) + * so it's possible OSP won't be able to provide us with + * an object due to just changed state + */ + QOS_DEBUG("%s: wanted %d objects, found only %d\n", + lod2obd(lod)->obd_name, stripe_count, stripe_idx); + for (i = saved_idx; i < stripe_idx; i++) { + LASSERT(stripes[i] != NULL); + dt_object_put(env, stripes[i]); + stripes[i] = NULL; + } + + /* makes sense to rebalance next time */ + set_bit(LQ_DIRTY, &ltd->ltd_qos.lq_flags); + clear_bit(LQ_SAME_SPACE, &ltd->ltd_qos.lq_flags); + + rc = -EAGAIN; + } else { + rc = stripe_idx; + }
+ +unlock: + up_write(&ltd->ltd_qos.lq_rw_sem); + + RETURN(rc); +} + +/** * Check stripe count the caller can use. * * For new layouts (no initialized components), check the total size of the @@ -1667,26 +1862,31 @@ out_nolock: * * \param[in] lod LOD device * \param[in] lo The lod_object + * \param[in] comp_idx The component id, which the amount of stripes is + calculated for * \param[in] stripe_count count the caller would like to use * * \retval the maximum usable stripe count */ __u16 lod_get_stripe_count(struct lod_device *lod, struct lod_object *lo, - __u16 stripe_count, bool overstriping) + int comp_idx, __u16 stripe_count, bool overstriping) { __u32 max_stripes = LOV_MAX_STRIPE_COUNT_OLD; /* max stripe count is based on OSD ea size */ unsigned int easize = lod->lod_osd_max_easize; int i; - if (!stripe_count) - stripe_count = lod->lod_desc.ld_default_stripe_count; + stripe_count = + lod->lod_ost_descs.ltd_lov_desc.ld_default_stripe_count; if (!stripe_count) stripe_count = 1; /* Overstriping allows more stripes than targets */ - if (stripe_count > lod->lod_desc.ld_active_tgt_count && !overstriping) - stripe_count = lod->lod_desc.ld_active_tgt_count; + if (stripe_count > + lod->lod_ost_descs.ltd_lov_desc.ld_active_tgt_count && + !overstriping) + stripe_count = + lod->lod_ost_descs.ltd_lov_desc.ld_active_tgt_count; if (lo->ldo_is_composite) { struct lod_layout_component *lod_comp; @@ -1699,9 +1899,17 @@ __u16 lod_get_stripe_count(struct lod_device *lod, struct lod_object *lo, lo->ldo_comp_cnt; for (i = 0; i < lo->ldo_comp_cnt; i++) { + unsigned int stripes; + + if (i == comp_idx) + continue; + lod_comp = &lo->ldo_comp_entries[i]; - comp_sz = lov_mds_md_size(lod_comp->llc_stripe_count, - LOV_MAGIC_V3); + /* Extension comp is never inited - 0 stripes on disk */ + stripes = lod_comp->llc_flags & LCME_FL_EXTENSION ?
0 : + lod_comp->llc_stripe_count; + + comp_sz = lov_mds_md_size(stripes, LOV_MAGIC_V3); total_comp_sz += comp_sz; if (lod_comp->llc_flags & LCME_FL_INIT) init_comp_sz += comp_sz; @@ -1719,6 +1927,7 @@ __u16 lod_get_stripe_count(struct lod_device *lod, struct lod_object *lo, } max_stripes = lov_mds_md_max_stripe_count(easize, LOV_MAGIC_V3); + max_stripes = (max_stripes == 0) ? 0 : max_stripes - 1; return (stripe_count < max_stripes) ? stripe_count : max_stripes; } @@ -1902,7 +2111,7 @@ int lod_qos_parse_config(const struct lu_env *env, struct lod_object *lo, { struct lod_layout_component *lod_comp; struct lod_device *d = lu2lod_dev(lod2lu_obj(lo)->lo_dev); - struct lov_desc *desc = &d->lod_desc; + struct lov_desc *desc = &d->lod_ost_descs.ltd_lov_desc; struct lov_user_md_v1 *v1 = NULL; struct lov_user_md_v3 *v3 = NULL; struct lov_comp_md_v1 *comp_v1 = NULL; @@ -1928,7 +2137,7 @@ int lod_qos_parse_config(const struct lu_env *env, struct lod_object *lo, else lod_free_comp_entries(lo); - rc = lod_verify_striping(d, lo, buf, false); + rc = lod_verify_striping(env, d, lo, buf, false); if (rc) RETURN(-EINVAL); @@ -2029,13 +2238,17 @@ int lod_qos_parse_config(const struct lu_env *env, struct lod_object *lo, lod_comp->llc_extent = *ext; lod_comp->llc_flags = comp_v1->lcm_entries[i].lcme_flags & - LCME_USER_FLAGS; + LCME_CL_COMP_FLAGS; } pool_name = NULL; + if (def_pool[0] != '\0') + pool_name = def_pool; + if (v1->lmm_magic == LOV_USER_MAGIC_V3 || v1->lmm_magic == LOV_USER_MAGIC_SPECIFIC) { v3 = (struct lov_user_md_v3 *)v1; + if (v3->lmm_pool_name[0] != '\0') pool_name = v3->lmm_pool_name; @@ -2043,12 +2256,11 @@ int lod_qos_parse_config(const struct lu_env *env, struct lod_object *lo, rc = lod_comp_copy_ost_lists(lod_comp, v3); if (rc) GOTO(free_comp, rc); + + pool_name = NULL; } } - if (pool_name == NULL && def_pool[0] != '\0') - pool_name = def_pool; - if (v1->lmm_pattern == 0) v1->lmm_pattern = LOV_PATTERN_RAID0; if (lov_pattern(v1->lmm_pattern) != LOV_PATTERN_RAID0 && 
@@ -2061,21 +2273,25 @@ int lod_qos_parse_config(const struct lu_env *env, struct lod_object *lo, } lod_comp->llc_pattern = v1->lmm_pattern; - lod_comp->llc_stripe_size = desc->ld_default_stripe_size; - if (v1->lmm_stripe_size) - lod_comp->llc_stripe_size = v1->lmm_stripe_size; + lod_comp->llc_stripe_size = v1->lmm_stripe_size; + lod_adjust_stripe_size(lod_comp, desc->ld_default_stripe_size); lod_comp->llc_stripe_count = desc->ld_default_stripe_count; if (v1->lmm_stripe_count || lov_pattern(v1->lmm_pattern) == LOV_PATTERN_MDT) lod_comp->llc_stripe_count = v1->lmm_stripe_count; + if (lov_pattern(lod_comp->llc_pattern) == LOV_PATTERN_MDT && + lod_comp->llc_stripe_count != 0) { + CDEBUG(D_LAYOUT, "%s: invalid stripe count: %u\n", + lod2obd(d)->obd_name, + lod_comp->llc_stripe_count); + GOTO(free_comp, rc = -EINVAL); + } + lod_comp->llc_stripe_offset = v1->lmm_stripe_offset; lod_obj_set_pool(lo, i, pool_name); - LASSERT(ergo(lov_pattern(lod_comp->llc_pattern) == - LOV_PATTERN_MDT, lod_comp->llc_stripe_count == 0)); - if (pool_name == NULL) continue; @@ -2118,34 +2334,34 @@ free_comp: int lod_prepare_avoidance(const struct lu_env *env, struct lod_object *lo) { struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev); - struct lod_tgt_descs *ltds = &lod->lod_ost_descs; struct lod_avoid_guide *lag = &lod_env_info(env)->lti_avoid; - struct cfs_bitmap *bitmap = NULL; + unsigned long *bitmap = NULL; __u32 *new_oss = NULL; - lag->lag_ost_avail = ltds->ltd_tgtnr; + lag->lag_ost_avail = lod->lod_ost_count; /* reset OSS avoid guide array */ lag->lag_oaa_count = 0; - if (lag->lag_oss_avoid_array && lag->lag_oaa_size < ltds->ltd_tgtnr) { - OBD_FREE(lag->lag_oss_avoid_array, - sizeof(__u32) * lag->lag_oaa_size); + if (lag->lag_oss_avoid_array && + lag->lag_oaa_size < lod->lod_ost_count) { + OBD_FREE_PTR_ARRAY(lag->lag_oss_avoid_array, lag->lag_oaa_size); lag->lag_oss_avoid_array = NULL; lag->lag_oaa_size = 0; } /* init OST avoid guide bitmap */ if (lag->lag_ost_avoid_bitmap) { 
- if (ltds->ltd_tgtnr <= lag->lag_ost_avoid_bitmap->size) { - CFS_RESET_BITMAP(lag->lag_ost_avoid_bitmap); + if (lod->lod_ost_count <= lag->lag_ost_avoid_size) { + bitmap_zero(lag->lag_ost_avoid_bitmap, + lag->lag_ost_avoid_size); } else { - CFS_FREE_BITMAP(lag->lag_ost_avoid_bitmap); + bitmap_free(lag->lag_ost_avoid_bitmap); lag->lag_ost_avoid_bitmap = NULL; } } if (!lag->lag_ost_avoid_bitmap) { - bitmap = CFS_ALLOCATE_BITMAP(ltds->ltd_tgtnr); + bitmap = bitmap_zalloc(lod->lod_ost_count, GFP_KERNEL); if (!bitmap) return -ENOMEM; } @@ -2157,19 +2373,21 @@ int lod_prepare_avoidance(const struct lu_env *env, struct lod_object *lo) * using OST count to allocate the array to store the OSS * id. */ - OBD_ALLOC(new_oss, sizeof(*new_oss) * ltds->ltd_tgtnr); + OBD_ALLOC_PTR_ARRAY(new_oss, lod->lod_ost_count); if (!new_oss) { - CFS_FREE_BITMAP(bitmap); + bitmap_free(bitmap); return -ENOMEM; } } if (new_oss) { lag->lag_oss_avoid_array = new_oss; - lag->lag_oaa_size = ltds->ltd_tgtnr; + lag->lag_oaa_size = lod->lod_ost_count; } - if (bitmap) + if (bitmap) { lag->lag_ost_avoid_bitmap = bitmap; + lag->lag_ost_avoid_size = lod->lod_ost_count; + } return 0; } @@ -2183,7 +2401,7 @@ void lod_collect_avoidance(struct lod_object *lo, struct lod_avoid_guide *lag, { struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev); struct lod_layout_component *lod_comp = &lo->ldo_comp_entries[comp_idx]; - struct cfs_bitmap *bitmap = lag->lag_ost_avoid_bitmap; + unsigned long *bitmap = lag->lag_ost_avoid_bitmap; int i, j; /* iterate mirrors */ @@ -2230,12 +2448,12 @@ void lod_collect_avoidance(struct lod_object *lo, struct lod_avoid_guide *lag, ost = OST_TGT(lod, comp->llc_ost_indices[j]); lsq = ost->ltd_qos.ltq_svr; - if (cfs_bitmap_check(bitmap, ost->ltd_index)) + if (test_bit(ost->ltd_index, bitmap)) continue; QOS_DEBUG("OST%d used in conflicting mirror " "component\n", ost->ltd_index); - cfs_bitmap_set(bitmap, ost->ltd_index); + set_bit(ost->ltd_index, bitmap); lag->lag_ost_avail--; for 
(k = 0; k < lag->lag_oaa_count; k++) { @@ -2273,7 +2491,7 @@ void lod_collect_avoidance(struct lod_object *lo, struct lod_avoid_guide *lag, */ int lod_qos_prep_create(const struct lu_env *env, struct lod_object *lo, struct lu_attr *attr, struct thandle *th, - int comp_idx) + int comp_idx, __u64 reserve) { struct lod_layout_component *lod_comp; struct lod_device *d = lu2lod_dev(lod2lu_obj(lo)->lo_dev); @@ -2288,6 +2506,7 @@ int lod_qos_prep_create(const struct lu_env *env, struct lod_object *lo, LASSERT(lo); LASSERT(lo->ldo_comp_cnt > comp_idx && lo->ldo_comp_entries != NULL); lod_comp = &lo->ldo_comp_entries[comp_idx]; + LASSERT(!(lod_comp->llc_flags & LCME_FL_EXTENSION)); /* A released component is being created */ if (lod_comp->llc_pattern & LOV_PATTERN_F_RELEASED) @@ -2306,8 +2525,8 @@ int lod_qos_prep_create(const struct lu_env *env, struct lod_object *lo, * statfs and check OST targets now, since ld_active_tgt_count * could be changed if some OSTs are [de]activated manually. */ - lod_qos_statfs_update(env, d); - stripe_len = lod_get_stripe_count(d, lo, + lod_qos_statfs_update(env, d, &d->lod_ost_descs); + stripe_len = lod_get_stripe_count(d, lo, comp_idx, lod_comp->llc_stripe_count, lod_comp->llc_pattern & LOV_PATTERN_OVERSTRIPING); @@ -2315,22 +2534,23 @@ int lod_qos_prep_create(const struct lu_env *env, struct lod_object *lo, if (stripe_len == 0) GOTO(out, rc = -ERANGE); lod_comp->llc_stripe_count = stripe_len; - OBD_ALLOC(stripe, sizeof(stripe[0]) * stripe_len); + OBD_ALLOC_PTR_ARRAY(stripe, stripe_len); if (stripe == NULL) GOTO(out, rc = -ENOMEM); - OBD_ALLOC(ost_indices, sizeof(*ost_indices) * stripe_len); + OBD_ALLOC_PTR_ARRAY(ost_indices, stripe_len); if (!ost_indices) GOTO(out, rc = -ENOMEM); +repeat: lod_getref(&d->lod_ost_descs); /* XXX: support for non-0 files w/o objects */ CDEBUG(D_OTHER, "tgt_count %d stripe_count %d\n", - d->lod_desc.ld_tgt_count, stripe_len); + d->lod_ost_count, stripe_len); if (lod_comp->llc_ostlist.op_array && 
lod_comp->llc_ostlist.op_count) { rc = lod_alloc_ost_list(env, lo, stripe, ost_indices, - th, comp_idx); + th, comp_idx, reserve); } else if (lod_comp->llc_stripe_offset == LOV_OFFSET_DEFAULT) { /** * collect OSTs and OSSs used in other mirrors whose @@ -2344,14 +2564,16 @@ int lod_qos_prep_create(const struct lu_env *env, struct lod_object *lo, comp_idx); lod_collect_avoidance(lo, lag, comp_idx); - rc = lod_alloc_qos(env, lo, stripe, ost_indices, flag, - th, comp_idx); + rc = lod_ost_alloc_qos(env, lo, stripe, ost_indices, + flag, th, comp_idx, reserve); if (rc == -EAGAIN) - rc = lod_alloc_rr(env, lo, stripe, ost_indices, - flag, th, comp_idx); + rc = lod_ost_alloc_rr(env, lo, stripe, + ost_indices, flag, th, + comp_idx, reserve); } else { - rc = lod_alloc_specific(env, lo, stripe, ost_indices, - flag, th, comp_idx); + rc = lod_ost_alloc_specific(env, lo, stripe, + ost_indices, flag, th, + comp_idx, reserve); } put_ldts: lod_putref(d, &d->lod_ost_descs); @@ -2359,6 +2581,15 @@ put_ldts: for (i = 0; i < stripe_len; i++) if (stripe[i] != NULL) dt_object_put(env, stripe[i]); + + /* In case there is no space on any OST, let's ignore + * the @reserve space to avoid an error at the init + * time, probably the actual IO will be less than the + * given @reserve space (aka extension_size). */ + if (reserve) { + reserve = 0; + goto repeat; + } lod_comp->llc_stripe_count = 0; } else { lod_comp->llc_stripe = stripe; @@ -2395,10 +2626,9 @@ put_ldts: out: if (rc < 0) { if (stripe) - OBD_FREE(stripe, sizeof(stripe[0]) * stripe_len); + OBD_FREE_PTR_ARRAY(stripe, stripe_len); if (ost_indices) - OBD_FREE(ost_indices, - sizeof(*ost_indices) * stripe_len); + OBD_FREE_PTR_ARRAY(ost_indices, stripe_len); } RETURN(rc); } @@ -2419,7 +2649,7 @@ int lod_prepare_create(const struct lu_env *env, struct lod_object *lo, /* no OST available */ /* XXX: should we be waiting a bit to prevent failures during * cluster initialization? 
*/ - if (d->lod_ostnr == 0) + if (!d->lod_ost_count) RETURN(-EIO); /* @@ -2450,7 +2680,7 @@ int lod_prepare_create(const struct lu_env *env, struct lod_object *lo, extent = &lod_comp->llc_extent; QOS_DEBUG("comp[%d] %lld "DEXT"\n", i, size, PEXT(extent)); if (!lo->ldo_is_composite || size >= extent->e_start) { - rc = lod_qos_prep_create(env, lo, attr, th, i); + rc = lod_qos_prep_create(env, lo, attr, th, i, 0); if (rc) break; }