4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License version 2 for more details. A copy is
14 * included in the COPYING file that accompanied this code.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved
24 * Use is subject to license terms.
26 * Copyright (c) 2012, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
31 * lustre/lod/lod_qos.c
33 * Implementation of different allocation algorithm used
34 * to distribute objects and data among OSTs.
37 #define DEBUG_SUBSYSTEM S_LOV
39 #include <asm/div64.h>
40 #include <linux/random.h>
42 #include <libcfs/libcfs.h>
43 #include <uapi/linux/lustre/lustre_idl.h>
44 #include <lustre_swab.h>
45 #include <obd_class.h>
47 #include "lod_internal.h"
49 /* check whether a target is available for new object allocation */
50 static inline int lod_statfs_check(struct lu_tgt_descs *ltd,
51 struct lu_tgt_desc *tgt)
53 struct obd_statfs *sfs = &tgt->ltd_statfs;
55 if (sfs->os_state & OS_STATFS_ENOSPC ||
56 (sfs->os_state & OS_STATFS_ENOINO &&
57 /* OST allocation allowed while precreated objects available */
58 (ltd->ltd_is_mdt || sfs->os_fprecreated == 0)))
61 /* If the OST is readonly then we can't allocate objects there */
62 if (sfs->os_state & OS_STATFS_READONLY)
65 /* object precreation is skipped on targets with max_create_count=0 */
66 if (sfs->os_state & OS_STATFS_NOPRECREATE)
73 * Check whether the target is available for new objects.
75 * Request statfs data from the given target and verify it's active and not
76 * read-only. If so, then it can be used to place new objects. This
77 * function also maintains the number of active/inactive targets and sets
78 * dirty flags if those numbers change so others can run re-balance procedures.
79 * No external locking is required.
81 * \param[in] env execution environment for this thread
82 * \param[in] d LOD device
83 * \param[in] ltd target table
84 * \param[in] tgt target
86 * \retval 0 if the target is good
87 * \retval negative negated errno on error
/*
 * NOTE(review): excerpt is garbled - the original file's line numbers are
 * fused into the text, interior lines are missing (gaps in the embedded
 * numbering), and "&ltd->..." was decoded to "<d->..." by an HTML-entity
 * accident.  Code kept byte-identical; comments only.  Restore from the
 * upstream lustre/lod/lod_qos.c before compiling.
 */
/*
 * Refresh statfs for one target via dt_statfs_info() and classify it with
 * lod_statfs_check().  On an active<->inactive transition, adjusts
 * desc->ld_active_tgt_count under d->lod_lock and sets LQ_DIRTY so QoS
 * weights get recalculated.  A reserve amount alone must not deactivate
 * an OST (see the os_reserved_mb_low check below).
 */
89 static int lod_statfs_and_check(const struct lu_env *env, struct lod_device *d,
90 struct lu_tgt_descs *ltd,
91 struct lu_tgt_desc *tgt, __u64 reserve)
93 struct obd_statfs_info info = { 0 };
/* "<d->ltd_lov_desc" is presumably "&ltd->ltd_lov_desc" - TODO confirm */
94 struct lov_desc *desc = <d->ltd_lov_desc;
101 info.os_enable_pre = 1;
102 rc = dt_statfs_info(env, tgt->ltd_tgt, &tgt->ltd_statfs, &info);
/* -ENOTCONN is expected for inactive targets, don't log it as an error */
103 if (rc && rc != -ENOTCONN)
104 CERROR("%s: statfs error: rc = %d\n", lod2obd(d)->obd_name, rc);
107 rc = lod_statfs_check(ltd, tgt);
109 /* reserving space shouldn't be enough to mark an OST inactive */
/* head of this condition missing from excerpt (original line 110) */
111 (reserve + (info.os_reserved_mb_low << 20) >
112 tgt->ltd_statfs.os_bavail * tgt->ltd_statfs.os_bsize))
115 /* check whether device has changed state (active, inactive) */
116 if (rc && tgt->ltd_active) {
117 /* turned inactive? */
/* re-check ltd_active under lod_lock so the transition is counted once */
118 spin_lock(&d->lod_lock);
119 if (tgt->ltd_active) {
124 LASSERT(desc->ld_active_tgt_count > 0);
125 desc->ld_active_tgt_count--;
126 set_bit(LQ_DIRTY, <d->ltd_qos.lq_flags);
127 CDEBUG(D_CONFIG, "%s: turns inactive\n",
128 tgt->ltd_exp->exp_obd->obd_name);
130 spin_unlock(&d->lod_lock);
131 } else if (rc == 0 && !tgt->ltd_active) {
/* inactive -> active transition, same double-check pattern */
133 spin_lock(&d->lod_lock);
134 LASSERTF(desc->ld_active_tgt_count < desc->ld_tgt_count,
135 "active tgt count %d, tgt nr %d\n",
136 desc->ld_active_tgt_count, desc->ld_tgt_count);
137 if (!tgt->ltd_active) {
140 desc->ld_active_tgt_count++;
141 set_bit(LQ_DIRTY, <d->ltd_qos.lq_flags);
142 CDEBUG(D_CONFIG, "%s: turns active\n",
143 tgt->ltd_exp->exp_obd->obd_name);
145 spin_unlock(&d->lod_lock);
147 if (rc == -ENOTCONN) {
148 /* In case that the ENOTCONN for inactive OST state is
149 * mistreated as MDT disconnection state by the client,
150 * this error should be changed to someone else.
159 * Maintain per-target statfs data.
161 * The function refreshes statfs data for all the targets every N seconds.
162 * The actual N is controlled via procfs and set to LOV_DESC_QOS_MAXAGE_DEFAULT
165 * \param[in] env execution environment for this thread
166 * \param[in] lod LOD device
167 * \param[in] ltd tgt table
/*
 * NOTE(review): excerpt is garbled - original line numbers fused into the
 * text, interior lines missing, "&ltd->" decoded to "<d->".  Code kept
 * byte-identical; comments only.
 */
/*
 * Refresh cached statfs data for all targets in @ltd if it is older than
 * 2 * ld_qos_maxage seconds.  LQ_SF_PROGRESS serializes concurrent
 * refreshers; LQ_DIRTY is set when any target's free-space figure changed
 * so the QoS weights get recomputed.
 */
169 void lod_qos_statfs_update(const struct lu_env *env, struct lod_device *lod,
170 struct lu_tgt_descs *ltd)
172 struct obd_device *obd = lod2obd(lod);
173 struct lu_tgt_desc *tgt;
178 max_age = ktime_get_seconds() - 2 * ltd->ltd_lov_desc.ld_qos_maxage;
180 if (obd->obd_osfs_age > max_age)
181 /* statfs data are quite recent, don't need to refresh it */
/* only one thread refreshes at a time; others just return */
184 if (test_and_set_bit(LQ_SF_PROGRESS, <d->ltd_qos.lq_flags))
/* re-check age after winning the flag: someone may have refreshed already */
187 if (obd->obd_osfs_age > max_age) {
188 /* statfs data are quite recent, don't need to refresh it */
189 clear_bit(LQ_SF_PROGRESS, <d->ltd_qos.lq_flags);
193 ltd_foreach_tgt(ltd, tgt) {
194 avail = tgt->ltd_statfs.os_bavail;
195 if (lod_statfs_and_check(env, lod, ltd, tgt, 0))
198 if (tgt->ltd_statfs.os_bavail != avail)
199 /* recalculate weights */
200 set_bit(LQ_DIRTY, <d->ltd_qos.lq_flags);
202 lod_putref(lod, ltd);
203 obd->obd_osfs_age = ktime_get_seconds();
205 clear_bit(LQ_SF_PROGRESS, <d->ltd_qos.lq_flags);
209 #define LOV_QOS_EMPTY ((__u32)-1)
212 * Calculate optimal round-robin order with regard to OSSes.
214 * Place all the OSTs from pool \a src_pool in a special array to be used for
215 * round-robin (RR) stripe allocation. The placement algorithm interleaves
216 * OSTs from the different OSSs so that RR allocation can balance OSSs evenly.
217 * Resorts the targets when the number of active targets changes (because of
218 * a new target or activation/deactivation).
220 * \param[in] lod LOD device
221 * \param[in] ltd tgt table
222 * \param[in] src_pool tgt pool
223 * \param[in] lqr round-robin list
225 * \retval 0 on success
226 * \retval -ENOMEM fails to allocate the array
/*
 * NOTE(review): excerpt is garbled - original line numbers fused into the
 * text, interior lines missing, "&ltd->" decoded to "<d->".  Code kept
 * byte-identical; comments only.
 */
/*
 * (Re)build the round-robin target order in @lqr from @src_pool, spreading
 * each server's targets evenly across the array so consecutive allocations
 * rotate over servers.  Rebuilds only when LQ_DIRTY is set, with a
 * double-check under lq_rw_sem.
 */
228 static int lod_qos_calc_rr(struct lod_device *lod, struct lu_tgt_descs *ltd,
229 const struct lu_tgt_pool *src_pool,
230 struct lu_qos_rr *lqr)
232 struct lu_svr_qos *svr;
233 struct lu_tgt_desc *tgt;
234 unsigned placed, real_count;
/* fast path: RR order still valid, nothing to do */
239 if (!test_bit(LQ_DIRTY, &lqr->lqr_flags)) {
240 LASSERT(lqr->lqr_pool.op_size);
244 /* Do actual allocation. */
245 down_write(<d->ltd_qos.lq_rw_sem);
248 * Check again. While we were sleeping on @lq_rw_sem something could
251 if (!test_bit(LQ_DIRTY, &lqr->lqr_flags)) {
252 LASSERT(lqr->lqr_pool.op_size);
253 up_write(<d->ltd_qos.lq_rw_sem);
257 real_count = src_pool->op_count;
259 /* Zero the pool array */
260 /* alloc_rr is holding a read lock on the pool, so nobody is adding/
261 deleting from the pool. The lq_rw_sem ensures that nobody else
263 lqr->lqr_pool.op_count = real_count;
264 rc = lu_tgt_pool_extend(&lqr->lqr_pool, real_count);
266 up_write(<d->ltd_qos.lq_rw_sem);
/* mark all slots empty before interleaving targets into them */
269 for (i = 0; i < lqr->lqr_pool.op_count; i++)
270 lqr->lqr_pool.op_array[i] = LOV_QOS_EMPTY;
272 /* Place all the tgts from 1 svr at the same time. */
274 list_for_each_entry(svr, <d->ltd_qos.lq_svr_list, lsq_svr_list) {
277 for (i = 0; i < lqr->lqr_pool.op_count; i++) {
280 if (!test_bit(src_pool->op_array[i],
281 ltd->ltd_tgt_bitmap))
284 tgt = LTD_TGT(ltd, src_pool->op_array[i]);
285 LASSERT(tgt && tgt->ltd_tgt);
286 if (tgt->ltd_qos.ltq_svr != svr)
289 /* Evenly space these tgts across arrayspace */
290 next = j * lqr->lqr_pool.op_count / svr->lsq_tgt_count;
/* linear-probe forward to the first free slot */
291 while (lqr->lqr_pool.op_array[next] != LOV_QOS_EMPTY)
292 next = (next + 1) % lqr->lqr_pool.op_count;
294 lqr->lqr_pool.op_array[next] = src_pool->op_array[i];
300 clear_bit(LQ_DIRTY, &lqr->lqr_flags);
301 up_write(<d->ltd_qos.lq_rw_sem);
303 if (placed != real_count) {
304 /* This should never happen */
305 LCONSOLE_ERROR_MSG(0x14e, "Failed to place all tgts in the "
306 "round-robin list (%d of %d).\n",
308 for (i = 0; i < lqr->lqr_pool.op_count; i++) {
309 LCONSOLE(D_WARNING, "rr #%d tgt idx=%d\n", i,
310 lqr->lqr_pool.op_array[i]);
/* force a rebuild on the next call */
312 set_bit(LQ_DIRTY, &lqr->lqr_flags);
320 * Instantiate and declare creation of a new object.
322 * The function instantiates LU representation for a new object on the
323 * specified device. Also it declares an intention to create that
324 * object on the storage target.
326 * Note lu_object_anon() is used which is a trick with regard to LU/OSD
327 * infrastructure - in the existing precreation framework we can't assign FID
328 * at this moment, we do this later once a transaction is started. So the
329 * special method instantiates FID-less object in the cache and later it
330 * will get a FID and proper placement in LU cache.
332 * \param[in] env execution environment for this thread
333 * \param[in] d LOD device
334 * \param[in] ost_idx OST target index where the object is being created
335 * \param[in] th transaction handle
337 * \retval object ptr on success, ERR_PTR() otherwise
/*
 * NOTE(review): excerpt is garbled - original line numbers fused into the
 * text, interior lines missing (e.g. trailing parameters of the signature,
 * original lines 341-344).  Code kept byte-identical; comments only.
 */
/*
 * Instantiate an anonymous (FID-less) object on OST @ost_idx and declare
 * its creation in transaction @th.  Returns the dt_object or ERR_PTR();
 * the real FID is assigned later by OSP inside the transaction.
 */
339 static struct dt_object *lod_qos_declare_object_on(const struct lu_env *env,
340 struct lod_device *d,
345 struct dt_allocation_hint *ah = &lod_env_info(env)->lti_ah;
346 struct lod_tgt_desc *ost;
347 struct lu_object *o, *n;
348 struct lu_device *nd;
349 struct dt_object *dt;
354 LASSERT(ost_idx < d->lod_ost_descs.ltd_tgts_size);
355 ost = OST_TGT(d,ost_idx);
357 LASSERT(ost->ltd_tgt);
359 nd = &ost->ltd_tgt->dd_lu_dev;
362 * allocate anonymous object with zero fid, real fid
363 * will be assigned by OSP within transaction
364 * XXX: to be fixed with fully-functional OST fids
366 o = lu_object_anon(env, nd, NULL);
368 GOTO(out, dt = ERR_CAST(o));
/* find the slice belonging to the target's device type */
370 n = lu_object_locate(o->lo_header, nd->ld_type);
371 if (unlikely(n == NULL)) {
372 CERROR("can't find slice\n");
373 lu_object_put(env, o);
374 GOTO(out, dt = ERR_PTR(-EINVAL));
377 dt = container_of(n, struct dt_object, do_lu);
379 ah->dah_can_block = can_block;
380 rc = lod_sub_declare_create(env, dt, NULL, ah, NULL, th);
382 CDEBUG(D_OTHER, "can't declare creation on #%u: %d\n",
/* drop the reference on failure so the anon object is freed */
384 lu_object_put(env, o);
393 * Calculate a minimum acceptable stripe count.
395 * Return an acceptable stripe count depending on flag LOD_USES_DEFAULT_STRIPE:
396 * all stripes or 3/4 of stripes. The code is written this way to avoid
397 * returning 0 for stripe_count < 4, like "stripe_count * 3 / 4" would do.
399 * \param[in] stripe_count number of stripes requested
400 * \param[in] flags 0 or LOD_USES_DEFAULT_STRIPE
402 * \retval acceptable stripecount
404 static int lod_stripe_count_min(__u32 stripe_count, enum lod_uses_hint flags)
406 return (flags & LOD_USES_DEFAULT_STRIPE ?
407 stripe_count - (stripe_count / 4) : stripe_count);
410 #define LOV_CREATE_RESEED_MULT 30
411 #define LOV_CREATE_RESEED_MIN 2000
414 * Initialize temporary tgt-in-use array.
416 * Allocate or extend the array used to mark targets already assigned to a new
417 * striping so they are not used more than once.
419 * \param[in] env execution environment for this thread
420 * \param[in] stripes number of items needed in the array
422 * \retval 0 on success
423 * \retval -ENOMEM on error
425 static inline int lod_qos_tgt_in_use_clear(const struct lu_env *env,
428 struct lod_thread_info *info = lod_env_info(env);
430 if (info->lti_ea_store_size < sizeof(int) * stripes)
431 lod_ea_store_resize(info, stripes * sizeof(int));
432 if (info->lti_ea_store_size < sizeof(int) * stripes) {
433 CERROR("can't allocate memory for tgt-in-use array\n");
436 memset(info->lti_ea_store, -1, sizeof(int) * stripes);
441 * Remember a target in the array of used targets.
443 * Mark the given target as used for a new striping being created. The status
444 * of an tgt in a striping can be checked with lod_qos_is_tgt_used().
446 * \param[in] env execution environment for this thread
447 * \param[in] idx index in the array
448 * \param[in] tgt_idx target index to mark as used
450 static inline void lod_qos_tgt_in_use(const struct lu_env *env,
451 int idx, int tgt_idx)
453 struct lod_thread_info *info = lod_env_info(env);
454 int *tgts = info->lti_ea_store;
456 LASSERT(info->lti_ea_store_size >= idx * sizeof(int));
461 * Check is tgt used in a striping.
463 * Checks whether tgt with the given index is marked as used in the temporary
464 * array (see lod_qos_tgt_in_use()).
466 * \param[in] env execution environment for this thread
467 * \param[in] tgt_idx target index to check
468 * \param[in] stripes the number of items used in the array already
473 static int lod_qos_is_tgt_used(const struct lu_env *env, int tgt_idx,
476 struct lod_thread_info *info = lod_env_info(env);
477 int *tgts = info->lti_ea_store;
480 for (j = 0; j < stripes; j++) {
481 if (tgts[j] == tgt_idx)
488 lod_obj_is_ost_use_skip_cb(const struct lu_env *env, struct lod_object *lo,
489 int comp_idx, struct lod_obj_stripe_cb_data *data)
491 struct lod_layout_component *comp = &lo->ldo_comp_entries[comp_idx];
493 return comp->llc_ost_indices == NULL;
497 lod_obj_is_ost_use_cb(const struct lu_env *env, struct lod_object *lo,
498 int comp_idx, struct lod_obj_stripe_cb_data *data)
500 struct lod_layout_component *comp = &lo->ldo_comp_entries[comp_idx];
503 for (i = 0; i < comp->llc_stripe_count; i++) {
504 if (comp->llc_ost_indices[i] == data->locd_ost_index) {
505 data->locd_ost_index = -1;
514 * Check is OST used in a composite layout
516 * \param[in] lo lod object
517 * \param[in] ost OST target index to check
519 * \retval false not used
522 static inline bool lod_comp_is_ost_used(const struct lu_env *env,
523 struct lod_object *lo, int ost)
525 struct lod_obj_stripe_cb_data data = { { 0 } };
527 data.locd_ost_index = ost;
528 data.locd_comp_skip_cb = lod_obj_is_ost_use_skip_cb;
529 data.locd_comp_cb = lod_obj_is_ost_use_cb;
531 (void)lod_obj_for_each_stripe(env, lo, NULL, &data);
533 return data.locd_ost_index == -1;
/*
 * NOTE(review): excerpt is missing this function's interior lines (original
 * 538-541), so any guard before the decrement is not visible here.  Code
 * kept byte-identical; comments only.
 */
/*
 * Decrement the count of still-available OSTs in the avoidance guide after
 * one has been reserved.  Presumably only meaningful for mirrored (FLR)
 * layouts - TODO confirm against the full source.
 */
536 static inline void lod_avoid_update(struct lod_object *lo,
537 struct lod_avoid_guide *lag)
542 lag->lag_ost_avail--;
/*
 * NOTE(review): excerpt is garbled - original line numbers fused into the
 * text and interior lines (including the return statements and the third
 * parameter line) are missing.  Code kept byte-identical; comments only.
 */
/*
 * Decide whether OST @index should be avoided for this allocation because
 * it (or its OSS) is already used by a conflicting component of another
 * mirror.  Consults the avoidance guide's OSS list and OST bitmap; gives
 * up avoiding once every available OST has been used (lag_ost_avail == 0).
 */
545 static inline bool lod_should_avoid_ost(struct lod_object *lo,
546 struct lod_avoid_guide *lag,
549 struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
550 struct lod_tgt_desc *ost = OST_TGT(lod, index);
551 struct lu_svr_qos *lsq = ost->ltd_qos.ltq_svr;
/* OST not in the device bitmap at all - cannot be used */
555 if (!test_bit(index, lod->lod_ost_bitmap)) {
556 CDEBUG(D_OTHER, "OST%d: been used in conflicting mirror component\n",
562 * we've tried our best, all available OSTs have been used in
563 * overlapped components in the other mirror
565 if (lag->lag_ost_avail == 0)
/* check whether this OST's server appears in the avoid list */
569 for (i = 0; i < lag->lag_oaa_count; i++) {
570 if (lag->lag_oss_avoid_array[i] == lsq->lsq_id) {
576 * if the OSS which OST[index] resides has not been used, we'd like to
582 /* if the OSS has been used, check whether the OST has been used */
583 if (!test_bit(index, lag->lag_ost_avoid_bitmap))
586 CDEBUG(D_OTHER, "OST%d: been used in conflicting mirror component\n",
/*
 * NOTE(review): excerpt is garbled - original line numbers fused into the
 * text and interior lines (trailing parameters, "continue"/return paths)
 * are missing.  Code kept byte-identical; comments only.
 */
/*
 * Check one OST candidate and, if acceptable, reserve (declare) a new
 * object on it.  @speed relaxes the filters over successive passes:
 * pass 0 skips OSTs with no precreated objects and OSTs already used by
 * other components; passes 0-1 skip degraded OSTs and conflicting-mirror
 * OSTs.  On success the stripe slot at *s_idx is filled and the OST is
 * marked in-use.
 */
591 static int lod_check_and_reserve_ost(const struct lu_env *env,
592 struct lod_object *lo,
593 struct lod_layout_component *lod_comp,
594 __u32 ost_idx, __u32 speed, __u32 *s_idx,
595 struct dt_object **stripe,
601 struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
602 struct lod_avoid_guide *lag = &lod_env_info(env)->lti_avoid;
603 struct lu_tgt_desc *ost = OST_TGT(lod, ost_idx);
605 __u32 stripe_idx = *s_idx;
610 rc = lod_statfs_and_check(env, lod, &lod->lod_ost_descs, ost, reserve);
615 * We expect number of precreated objects in f_ffree at
616 * the first iteration, skip OSPs with no objects ready
618 if (ost->ltd_statfs.os_fprecreated == 0 && speed == 0) {
619 CDEBUG(D_OTHER, "#%d: precreation is empty\n", ost_idx);
624 * try to use another OSP if this one is degraded
626 if (ost->ltd_statfs.os_state & OS_STATFS_DEGRADED && speed < 2) {
627 CDEBUG(D_OTHER, "#%d: degraded\n", ost_idx);
632 * try not allocate on OST which has been used by other
635 if (speed == 0 && lod_comp_is_ost_used(env, lo, ost_idx)) {
636 CDEBUG(D_OTHER, "iter %d: OST%d used by other component\n",
642 * try not allocate OSTs used by conflicting component of other mirrors
643 * for the first and second time.
645 if (speed < 2 && lod_should_avoid_ost(lo, lag, ost_idx)) {
646 CDEBUG(D_OTHER, "iter %d: OST%d used by conflicting mirror component\n",
651 /* do not put >1 objects on a single OST, except for overstriping */
652 if (lod_qos_is_tgt_used(env, ost_idx, stripe_idx)) {
653 if (lod_comp->llc_pattern & LOV_PATTERN_OVERSTRIPING)
/* only the slowest pass (speed > 1) is allowed to block in declare */
659 o = lod_qos_declare_object_on(env, lod, ost_idx, (speed > 1), th);
661 CDEBUG(D_OTHER, "can't declare new object on #%u: %d\n",
662 ost_idx, (int) PTR_ERR(o));
668 * We've successfully declared (reserved) an object
670 lod_avoid_update(lo, lag);
671 lod_qos_tgt_in_use(env, stripe_idx, ost_idx);
672 stripe[stripe_idx] = o;
673 ost_indices[stripe_idx] = ost_idx;
674 CFS_FAIL_TIMEOUT(OBD_FAIL_MDS_LOV_CREATE_RACE, 2);
682 * Allocate a striping using round-robin algorithm.
684 * Allocates a new striping using round-robin algorithm. The function refreshes
685 * all the internal structures (statfs cache, array of available OSTs sorted
686 * with regard to OSS, etc). The number of stripes required is taken from the
687 * object (must be prepared by the caller), but can change if the flag
688 * LOD_USES_DEFAULT_STRIPE is supplied. The caller should ensure nobody else
689 * is trying to create a striping on the object in parallel. All the internal
690 * structures (like pools, etc) are protected and no additional locking is
691 * required. The function succeeds even if a single stripe is allocated. To save
692 * time we give priority to targets which already have objects precreated.
693 * Full OSTs are skipped (see lod_qos_dev_is_full() for the details).
695 * \param[in] env execution environment for this thread
696 * \param[in] lo LOD object
697 * \param[out] stripe striping created
698 * \param[out] ost_indices ost indices of striping created
699 * \param[in] flags allocation flags (0 or LOD_USES_DEFAULT_STRIPE)
700 * \param[in] th transaction handle
701 * \param[in] comp_idx index of ldo_comp_entries
703 * \retval 0 on success
704 * \retval -ENOSPC if not enough OSTs are found
705 * \retval negative negated errno for other failures
/*
 * NOTE(review): excerpt is garbled - original line numbers fused into the
 * text, interior lines missing (labels, RETURNs, parts of conditions).
 * Code kept byte-identical; comments only.
 */
/*
 * Round-robin OST allocator for one layout component: rebuilds the RR order
 * if dirty, then walks the RR array reserving objects via
 * lod_check_and_reserve_ost(), making up to three passes (speed 0..2) with
 * progressively relaxed filters.  Honors pools and overstriping.
 */
707 static int lod_ost_alloc_rr(const struct lu_env *env, struct lod_object *lo,
708 struct dt_object **stripe, __u32 *ost_indices,
709 enum lod_uses_hint flags, struct thandle *th,
710 int comp_idx, __u64 reserve)
712 struct lod_layout_component *lod_comp;
713 struct lod_device *m = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
714 struct pool_desc *pool = NULL;
715 struct lu_tgt_pool *osts;
716 struct lu_qos_rr *lqr;
717 unsigned int i, array_idx;
718 __u32 stripe_idx = 0;
719 __u32 stripe_count, stripe_count_min, ost_idx;
720 int rc, speed = 0, ost_connecting = 0;
721 int idx, stripes_per_ost = 1;
722 bool overstriped = false;
725 LASSERT(lo->ldo_comp_cnt > comp_idx && lo->ldo_comp_entries != NULL);
726 lod_comp = &lo->ldo_comp_entries[comp_idx];
727 stripe_count = lod_comp->llc_stripe_count;
728 stripe_count_min = lod_stripe_count_min(stripe_count, flags);
/* use the component's pool if one is named, otherwise all OSTs */
730 if (lod_comp->llc_pool != NULL)
731 pool = lod_find_pool(m, lod_comp->llc_pool);
734 down_read(&pool_tgt_rw_sem(pool));
735 osts = &(pool->pool_obds);
736 lqr = &(pool->pool_rr);
738 osts = &m->lod_ost_descs.ltd_tgt_pool;
739 lqr = &(m->lod_ost_descs.ltd_qos.lq_rr);
742 rc = lod_qos_calc_rr(m, &m->lod_ost_descs, osts, lqr);
746 rc = lod_qos_tgt_in_use_clear(env, stripe_count);
750 down_read(&m->lod_ost_descs.ltd_qos.lq_rw_sem);
751 spin_lock(&lqr->lqr_alloc);
/* periodically reseed the RR start point to avoid systematic bias */
752 if (--lqr->lqr_start_count <= 0) {
753 atomic_set(&lqr->lqr_start_idx,
754 get_random_u32_below(osts->op_count));
755 lqr->lqr_start_count =
756 (LOV_CREATE_RESEED_MIN / max(osts->op_count, 1U) +
757 LOV_CREATE_RESEED_MULT) * max(osts->op_count, 1U);
758 } else if (atomic_read(&lqr->lqr_start_idx) >= osts->op_count) {
759 /* If we have allocated from all of the tgts, slowly
760 * precess the next start OST if the tgt/stripe count
761 * difference isn't already doing this for us.
763 atomic_sub(osts->op_count, &lqr->lqr_start_idx);
764 if (stripe_count > 1 && (osts->op_count % stripe_count) != 1)
765 ++lqr->lqr_offset_idx;
767 spin_unlock(&lqr->lqr_alloc);
/* with overstriping each OST may take more than one stripe */
768 if (lod_comp->llc_pattern & LOV_PATTERN_OVERSTRIPING)
770 (lod_comp->llc_stripe_count - 1) / osts->op_count + 1;
773 CDEBUG(D_OTHER, "pool '%s' want %d start_idx %d start_count %d offset %d active %d count %d\n",
774 lod_comp->llc_pool ? lod_comp->llc_pool : "",
775 stripe_count, atomic_read(&lqr->lqr_start_idx),
776 lqr->lqr_start_count, lqr->lqr_offset_idx, osts->op_count,
779 for (i = 0, idx = 0; i < osts->op_count * stripes_per_ost &&
780 stripe_idx < stripe_count; i++) {
781 if (likely(speed < 2) || i == 0)
782 idx = atomic_inc_return(&lqr->lqr_start_idx);
786 array_idx = (idx + lqr->lqr_offset_idx) %
788 ost_idx = lqr->lqr_pool.op_array[array_idx];
790 CDEBUG(D_OTHER, "#%d strt %d act %d strp %d ary %d idx %d\n",
791 i, idx, /* XXX: active*/ 0,
792 stripe_idx, array_idx, ost_idx);
/* skip empty RR slots and OSTs no longer present */
794 if ((ost_idx == LOV_QOS_EMPTY) ||
795 !test_bit(ost_idx, m->lod_ost_bitmap))
798 /* Fail Check before osc_precreate() is called
799 so we can only 'fail' single OSC. */
800 if (CFS_FAIL_CHECK(OBD_FAIL_MDS_OSC_PRECREATE) && ost_idx == 0)
803 if (CFS_FAIL_PRECHECK(OBD_FAIL_MDS_LOD_CREATE_PAUSE)) {
804 clear_bit(LQ_SAME_SPACE,
805 &m->lod_ost_descs.ltd_qos.lq_flags);
806 CFS_FAIL_TIMEOUT(OBD_FAIL_MDS_LOD_CREATE_PAUSE,
809 rc = lod_check_and_reserve_ost(env, lo, lod_comp, ost_idx,
810 speed, &stripe_idx, stripe,
811 ost_indices, th, &overstriped,
/* remember if any failure was just a still-connecting target */
814 if (rc != 0 && OST_TGT(m, ost_idx)->ltd_discon)
817 if ((speed < 2) && (stripe_idx < stripe_count_min)) {
818 /* Try again, allowing slower OSCs */
824 up_read(&m->lod_ost_descs.ltd_qos.lq_rw_sem);
826 /* If there are enough OSTs, a component with overstriping requested
827 * will not actually end up overstriped. The comp should reflect this.
830 lod_comp->llc_pattern &= ~LOV_PATTERN_OVERSTRIPING;
/* shrink the component to the number of stripes actually reserved */
833 lod_comp->llc_stripe_count = stripe_idx;
834 /* at least one stripe is allocated */
837 /* nobody provided us with a single object */
846 up_read(&pool_tgt_rw_sem(pool));
847 /* put back ref got by lod_find_pool() */
848 lod_pool_putref(pool);
/*
 * NOTE(review): excerpt is garbled - the return-type line of this function
 * and interior lines are missing.  Code kept byte-identical; comments only.
 */
/*
 * Initialize the tgt-in-use array for MDT allocation and, when splitting a
 * directory (stripe_idx > 1), pre-mark the MDTs backing the already
 * existing stripes so they are not reused.  Stripe 0 is local and is never
 * marked.
 */
855 lod_qos_mdt_in_use_init(const struct lu_env *env,
856 const struct lu_tgt_descs *ltd,
857 u32 stripe_idx, u32 stripe_count,
858 const struct lu_tgt_pool *pool,
859 struct dt_object **stripes)
862 struct lu_tgt_desc *mdt;
866 rc = lod_qos_tgt_in_use_clear(env, stripe_count);
870 /* if stripe_idx > 1, we are splitting directory, mark existing stripes
871 * in_use. Because for either split or creation, stripe 0 is local,
872 * don't mark it in use.
874 for (i = 1; i < stripe_idx; i++) {
/* find which pool MDT backs this existing stripe by device match */
876 for (j = 0; j < pool->op_count; j++) {
877 mdt_idx = pool->op_array[j];
879 if (!test_bit(mdt_idx, ltd->ltd_tgt_bitmap))
882 mdt = LTD_TGT(ltd, mdt_idx);
883 if (&mdt->ltd_tgt->dd_lu_dev ==
884 stripes[i]->do_lu.lo_dev)
885 lod_qos_tgt_in_use(env, i, mdt_idx);
893 * Allocate a striping using round-robin algorithm.
895 * Allocates a new striping using round-robin algorithm. The function refreshes
896 * all the internal structures (statfs cache, array of available remote MDTs
897 * sorted with regard to MDS, etc). The number of stripes required is taken from
898 * the object (must be prepared by the caller). The caller should ensure nobody
899 * else is trying to create a striping on the object in parallel. All the
900 * internal structures (like pools, etc) are protected and no additional locking
901 * is required. The function succeeds even if a single stripe is allocated.
903 * \param[in] env execution environment for this thread
904 * \param[in] lo LOD object
905 * \param[out] stripes striping created
907 * \retval positive stripe objects allocated, including the first stripe
909 * \retval -ENOSPC if not enough MDTs are found
910 * \retval negative negated errno for other failures
/*
 * NOTE(review): excerpt is garbled - original line numbers fused into the
 * text, interior lines missing, "&ltd->" decoded to "<d->".  Code kept
 * byte-identical; comments only.
 */
/*
 * Round-robin MDT allocator for directory striping: walks the RR-ordered
 * MDT pool, skipping used/disconnected/full MDTs (and degraded ones on the
 * first pass), allocates a FID and locates a new stripe object on each
 * selected MDT.  Returns the number of stripes filled (> saved_idx) or
 * -EINPROGRESS when targets were still connecting.
 */
912 int lod_mdt_alloc_rr(const struct lu_env *env, struct lod_object *lo,
913 struct dt_object **stripes, u32 stripe_idx,
916 struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
917 struct lu_tgt_descs *ltd = &lod->lod_mdt_descs;
918 struct lu_tgt_pool *pool;
919 struct lu_qos_rr *lqr;
920 struct lu_tgt_desc *mdt;
921 struct lu_object_conf conf = { .loc_flags = LOC_F_NEW };
922 struct lu_fid fid = { 0 };
923 struct dt_object *dto;
924 unsigned int pool_idx;
926 u32 saved_idx = stripe_idx;
928 bool use_degraded = false;
929 int tgt_connecting = 0;
934 pool = <d->ltd_tgt_pool;
935 lqr = <d->ltd_qos.lq_rr;
936 rc = lod_qos_calc_rr(lod, ltd, pool, lqr);
940 rc = lod_qos_mdt_in_use_init(env, ltd, stripe_idx, stripe_count, pool,
945 down_read(<d->ltd_qos.lq_rw_sem);
946 spin_lock(&lqr->lqr_alloc);
/* periodically reseed the RR start point to avoid systematic bias */
947 if (--lqr->lqr_start_count <= 0) {
948 atomic_set(&lqr->lqr_start_idx,
949 get_random_u32_below(pool->op_count));
950 lqr->lqr_start_count =
951 (LOV_CREATE_RESEED_MIN / max(pool->op_count, 1U) +
952 LOV_CREATE_RESEED_MULT) * max(pool->op_count, 1U);
953 } else if (atomic_read(&lqr->lqr_start_idx) >= pool->op_count) {
954 /* If we have allocated from all of the tgts, slowly
955 * precess the next start if the tgt/stripe count isn't
956 * already doing this for us. */
957 atomic_sub(pool->op_count, &lqr->lqr_start_idx);
958 if (stripe_count - 1 > 1 &&
959 (pool->op_count % (stripe_count - 1)) != 1)
960 ++lqr->lqr_offset_idx;
962 spin_unlock(&lqr->lqr_alloc);
965 CDEBUG(D_OTHER, "want=%d start_idx=%d start_count=%d offset=%d active=%d count=%d\n",
966 stripe_count - 1, atomic_read(&lqr->lqr_start_idx),
967 lqr->lqr_start_count, lqr->lqr_offset_idx, pool->op_count,
970 for (i = 0; i < pool->op_count && stripe_idx < stripe_count; i++) {
973 idx = atomic_inc_return(&lqr->lqr_start_idx);
974 pool_idx = (idx + lqr->lqr_offset_idx) %
976 mdt_idx = lqr->lqr_pool.op_array[pool_idx];
977 mdt = LTD_TGT(ltd, mdt_idx);
979 CDEBUG(D_OTHER, "#%d strt %d act %d strp %d ary %d idx %d\n",
980 i, idx, /* XXX: active*/ 0,
981 stripe_idx, pool_idx, mdt_idx);
/* skip empty RR slots and MDTs no longer present */
983 if (mdt_idx == LOV_QOS_EMPTY ||
984 !test_bit(mdt_idx, ltd->ltd_tgt_bitmap))
987 /* do not put >1 objects on one MDT */
988 if (lod_qos_is_tgt_used(env, mdt_idx, stripe_idx))
991 if (mdt->ltd_discon) {
/* skip MDTs with no space/inodes or marked readonly */
996 if (lod_statfs_check(ltd, mdt))
999 /* try to use another OSP if this one is degraded */
1000 if (mdt->ltd_statfs.os_state & OS_STATFS_DEGRADED &&
1002 CDEBUG(D_OTHER, "#%d: degraded\n", mdt_idx);
/* reserve a FID on the chosen MDT for the new stripe */
1006 rc = dt_fid_alloc(env, mdt->ltd_tgt, &fid, NULL, NULL);
1008 CDEBUG(D_OTHER, "#%d: alloc FID failed: %dl\n", mdt_idx, rc);
1012 dto = dt_locate_at(env, mdt->ltd_tgt, &fid,
1013 lo->ldo_obj.do_lu.lo_dev->ld_site->ls_top_dev,
1017 CDEBUG(D_OTHER, "can't alloc stripe on #%u: %d\n",
1018 mdt->ltd_index, (int) PTR_ERR(dto));
1020 if (mdt->ltd_discon)
1025 lod_qos_tgt_in_use(env, stripe_idx, mdt_idx);
1026 stripes[stripe_idx++] = dto;
1029 if (!use_degraded && stripe_idx < stripe_count) {
1030 /* Try again, allowing slower MDTs */
1031 use_degraded = true;
1036 up_read(<d->ltd_qos.lq_rw_sem);
1038 if (stripe_idx > saved_idx)
1039 /* at least one stripe is allocated */
1042 /* nobody provided us with a single object */
1044 RETURN(-EINPROGRESS);
1050 * Allocate a specific striping layout on a user defined set of OSTs.
1052 * Allocates new striping using the OST index range provided by the data from
1053 * the lmm_obejcts contained in the lov_user_md passed to this method. Full
1054 * OSTs are not considered. The exact order of OSTs requested by the user
1055 * is respected as much as possible depending on OST status. The number of
1056 * stripes needed and stripe offset are taken from the object. If that number
1057 * can not be met, then the function returns a failure and then it's the
1058 * caller's responsibility to release the stripes allocated. All the internal
1059 * structures are protected, but no concurrent allocation is allowed on the
1062 * \param[in] env execution environment for this thread
1063 * \param[in] lo LOD object
1064 * \param[out] stripe striping created
1065 * \param[out] ost_indices ost indices of striping created
1066 * \param[in] th transaction handle
1067 * \param[in] comp_idx index of ldo_comp_entries
1069 * \retval 0 on success
1070 * \retval -ENODEV OST index does not exist on file system
1071 * \retval -EINVAL requested OST index is invalid
1072 * \retval negative negated errno on error
/*
 * NOTE(review): excerpt is garbled - original line numbers fused into the
 * text, interior lines missing (error GOTOs, loop closings, RETURN).  Code
 * kept byte-identical; comments only.
 */
/*
 * Allocate stripes on the exact OST list the user supplied in the
 * component's llc_ostlist, starting from llc_stripe_offset and wrapping
 * around the list.  Each OST is statfs-checked and an object is declared
 * on it; duplicates are rejected unless overstriping is requested.
 */
1074 static int lod_alloc_ost_list(const struct lu_env *env, struct lod_object *lo,
1075 struct dt_object **stripe, __u32 *ost_indices,
1076 struct thandle *th, int comp_idx, __u64 reserve)
1078 struct lod_layout_component *lod_comp;
1079 struct lod_device *m = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
1080 struct dt_object *o;
1081 unsigned int array_idx = 0;
1082 int stripe_count = 0;
1087 /* for specific OSTs layout */
1088 LASSERT(lo->ldo_comp_cnt > comp_idx && lo->ldo_comp_entries != NULL);
1089 lod_comp = &lo->ldo_comp_entries[comp_idx];
1090 LASSERT(lod_comp->llc_ostlist.op_array);
1091 LASSERT(lod_comp->llc_ostlist.op_count);
1093 rc = lod_qos_tgt_in_use_clear(env, lod_comp->llc_stripe_count);
/* default start offset is the first OST in the user's list */
1097 if (lod_comp->llc_stripe_offset == LOV_OFFSET_DEFAULT)
1098 lod_comp->llc_stripe_offset =
1099 lod_comp->llc_ostlist.op_array[0];
/* locate the start offset within the list to begin the wrap-around walk */
1101 for (i = 0; i < lod_comp->llc_stripe_count; i++) {
1102 if (lod_comp->llc_ostlist.op_array[i] ==
1103 lod_comp->llc_stripe_offset) {
1108 if (i == lod_comp->llc_stripe_count) {
1110 "%s: start index %d not in the specified list of OSTs\n",
1111 lod2obd(m)->obd_name, lod_comp->llc_stripe_offset);
1115 for (i = 0; i < lod_comp->llc_stripe_count;
1116 i++, array_idx = (array_idx + 1) % lod_comp->llc_stripe_count) {
1117 __u32 ost_idx = lod_comp->llc_ostlist.op_array[array_idx];
1119 if (!test_bit(ost_idx, m->lod_ost_bitmap)) {
1124 /* do not put >1 objects on a single OST, except for
1127 if (lod_qos_is_tgt_used(env, ost_idx, stripe_count) &&
1128 !(lod_comp->llc_pattern & LOV_PATTERN_OVERSTRIPING)) {
1133 rc = lod_statfs_and_check(env, m, &m->lod_ost_descs,
1134 LTD_TGT(&m->lod_ost_descs, ost_idx),
1136 if (rc < 0) /* this OSP doesn't feel well */
1139 o = lod_qos_declare_object_on(env, m, ost_idx, true, th);
1143 "%s: can't declare new object on #%u: %d\n",
1144 lod2obd(m)->obd_name, ost_idx, rc);
1149 * We've successfully declared (reserved) an object
1151 lod_qos_tgt_in_use(env, stripe_count, ost_idx);
1152 stripe[stripe_count] = o;
1153 ost_indices[stripe_count] = ost_idx;
1161 * Allocate a striping on a predefined set of OSTs.
1163 * Allocates new layout starting from OST index in lo->ldo_stripe_offset.
1164 * Full OSTs are not considered. The exact order of OSTs is not important and
1165 * varies depending on OST status. The allocation procedure prefers the targets
1166 * with precreated objects ready. The number of stripes needed and stripe
1167 * offset are taken from the object. If that number cannot be met, then the
1168 * function returns an error and then it's the caller's responsibility to
1169 * release the stripes allocated. All the internal structures are protected,
1170 * but no concurrent allocation is allowed on the same objects.
1172 * \param[in] env execution environment for this thread
1173 * \param[in] lo LOD object
1174 * \param[out] stripe striping created
1175 * \param[out] ost_indices ost indices of striping created
1176 * \param[in] flags not used
1177 * \param[in] th transaction handle
1178 * \param[in] comp_idx index of ldo_comp_entries
1180 * \retval 0 on success
1181 * \retval -ENOSPC if no OST objects are available at all
1182 * \retval -EFBIG if not enough OST objects are found
1183 * \retval -EINVAL requested offset is invalid
1184 * \retval negative errno on failure
1186 static int lod_ost_alloc_specific(const struct lu_env *env,
1187 struct lod_object *lo,
1188 struct dt_object **stripe, __u32 *ost_indices,
1189 enum lod_uses_hint flags, struct thandle *th,
1190 int comp_idx, __u64 reserve)
1192 struct lod_layout_component *lod_comp;
1193 struct lod_device *m = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
1194 struct dt_object *o;
1195 struct lu_tgt_desc *tgt;
1197 unsigned int i, array_idx, ost_count;
1198 int rc, stripe_num = 0;
1200 struct pool_desc *pool = NULL;
1201 struct lu_tgt_pool *osts;
1202 int stripes_per_ost = 1;
1203 bool overstriped = false;
1206 LASSERT(lo->ldo_comp_cnt > comp_idx && lo->ldo_comp_entries != NULL);
1207 lod_comp = &lo->ldo_comp_entries[comp_idx];
/* reset the per-allocation "target already used" tracking */
1209 rc = lod_qos_tgt_in_use_clear(env, lod_comp->llc_stripe_count);
/* restrict allocation to the component's named pool, if any;
 * otherwise allocate from the full OST set */
1213 if (lod_comp->llc_pool != NULL)
1214 pool = lod_find_pool(m, lod_comp->llc_pool);
1217 down_read(&pool_tgt_rw_sem(pool));
1218 osts = &(pool->pool_obds);
1220 osts = &m->lod_ost_descs.ltd_tgt_pool;
1223 ost_count = osts->op_count;
1226 /* search loi_ost_idx in ost array */
1228 for (i = 0; i < ost_count; i++) {
1229 if (osts->op_array[i] == lod_comp->llc_stripe_offset) {
1234 if (i == ost_count) {
1235 CERROR("Start index %d not found in pool '%s'\n",
1236 lod_comp->llc_stripe_offset,
1237 lod_comp->llc_pool ? lod_comp->llc_pool : "");
1238 GOTO(out, rc = -EINVAL);
/* with overstriping the same OST may carry multiple stripes */
1241 if (lod_comp->llc_pattern & LOV_PATTERN_OVERSTRIPING)
1243 (lod_comp->llc_stripe_count - 1)/ost_count + 1;
1245 /* user specifies bigger stripe count than available ost count */
1246 if (lod_comp->llc_stripe_count > ost_count * stripes_per_ost)
1247 lod_comp->llc_stripe_count = ost_count * stripes_per_ost;
/* walk the OST array round-robin, starting at the requested offset */
1249 for (i = 0; i < ost_count * stripes_per_ost;
1250 i++, array_idx = (array_idx + 1) % ost_count) {
1251 ost_idx = osts->op_array[array_idx];
/* skip OSTs not present in the device bitmap */
1253 if (!test_bit(ost_idx, m->lod_ost_bitmap))
1256 /* Fail Check before osc_precreate() is called
1257 so we can only 'fail' single OSC. */
1258 if (CFS_FAIL_CHECK(OBD_FAIL_MDS_OSC_PRECREATE) && ost_idx == 0)
1262 * do not put >1 objects on a single OST, except for
1263 * overstriping, where it is intended
1265 if (lod_qos_is_tgt_used(env, ost_idx, stripe_num)) {
1266 if (lod_comp->llc_pattern & LOV_PATTERN_OVERSTRIPING)
1273 * try not to allocate on an OST used by another component
1275 if (speed == 0 && i != 0 &&
1276 lod_comp_is_ost_used(env, lo, ost_idx))
1279 tgt = LTD_TGT(&m->lod_ost_descs, ost_idx);
1281 /* Drop slow OSCs if we can, but not for requested start idx.
1283 * This means "if OSC is slow and it is not the requested
1284 * start OST, then it can be skipped, otherwise skip it only
1285 * if it is inactive/recovering/out-of-space." */
1287 rc = lod_statfs_and_check(env, m, &m->lod_ost_descs,
1290 /* this OSP doesn't feel well */
1295 * We expect number of precreated objects at the first
1296 * iteration. Skip OSPs with no objects ready. Don't apply
1297 * this logic to OST specified with stripe_offset.
1299 if (i && !tgt->ltd_statfs.os_fprecreated && !speed)
/* reserve (declare) an object on this OST within the transaction */
1302 o = lod_qos_declare_object_on(env, m, ost_idx, true, th);
1304 CDEBUG(D_OTHER, "can't declare new object on #%u: %d\n",
1305 ost_idx, (int) PTR_ERR(o));
1310 * We've successfully declared (reserved) an object
1312 lod_qos_tgt_in_use(env, stripe_num, ost_idx);
1313 stripe[stripe_num] = o;
1314 ost_indices[stripe_num] = ost_idx;
1317 /* We have enough stripes */
1318 if (stripe_num == lod_comp->llc_stripe_count)
1322 /* Try again, allowing slower OSCs */
1327 /* If we were passed specific striping params, then a failure to
1328 * meet those requirements is an error, since we can't reallocate
1329 * that memory (it might be part of a larger array or something).
1331 CERROR("can't lstripe objid "DFID": have %d want %u\n",
1332 PFID(lu_object_fid(lod2lu_obj(lo))), stripe_num,
1333 lod_comp->llc_stripe_count);
/* nothing allocated at all -> ENOSPC; partial allocation -> EFBIG */
1334 rc = stripe_num == 0 ? -ENOSPC : -EFBIG;
1336 /* If there are enough OSTs, a component with overstriping requested
1337 * will not actually end up overstriped. The comp should reflect this.
1339 if (rc == 0 && !overstriped)
1340 lod_comp->llc_pattern &= ~LOV_PATTERN_OVERSTRIPING;
1344 up_read(&pool_tgt_rw_sem(pool));
1345 /* put back ref got by lod_find_pool() */
1346 lod_pool_putref(pool);
1352 #ifdef HAVE_DOWN_WRITE_KILLABLE
/* Pairs a kernel timer with the task that is about to sleep in a
 * killable down_write() (see lod_ost_alloc_qos()); the timer is used
 * to bound the wait. */
1353 struct semaphore_timer {
1354 struct timer_list timer;
1355 struct task_struct *task;
/* Timer callback: SIGKILL the recorded task so its
 * down_write_killable() aborts instead of blocking forever. */
1358 static void process_semaphore_timer(struct timer_list *t)
1360 struct semaphore_timer *timeout = cfs_from_timer(timeout, t, timer);
1362 send_sig(SIGKILL, timeout->task, 1);
1367 * Calculate penalties per-ost in a pool
1369 * The algorithm is similar to ltd_qos_penalties_calc(), but much simpler,
1370 * just considering the space of each OST in this pool.
1372 * \param[in] lod lod_device
1373 * \param[in] pool pool_desc
1375 * \retval 0 on success
1376 * \retval -EAGAIN the number of OSTs isn't enough or all tgt spaces are
1379 static int lod_pool_qos_penalties_calc(struct lod_device *lod,
1380 struct pool_desc *pool)
1382 struct lu_tgt_descs *ltd = &lod->lod_ost_descs;
1383 struct lu_qos *qos = &ltd->ltd_qos;
1384 struct lov_desc *desc = &ltd->ltd_lov_desc;
1385 struct lu_tgt_pool *osts = &pool->pool_obds;
1386 struct lod_tgt_desc *ost;
1387 __u64 ba_max, ba_min, ba;
1395 now = ktime_get_real_seconds();
/* still within the "all OSTs have similar space" window: nothing to do */
1397 if (pool->pool_same_space && now < pool->pool_same_space_expire)
/* a penalty is spread over the OTHER targets, hence op_count - 1 */
1400 num_active = osts->op_count - 1;
1402 GOTO(out, rc = -EAGAIN);
1404 prio_wide = 256 - qos->lq_prio_free;
1406 ba_min = (__u64)(-1);
1409 /* Calculate penalty per OST */
1410 for (i = 0; i < osts->op_count; i++) {
1411 if (!test_bit(osts->op_array[i], lod->lod_ost_bitmap))
1414 ost = OST_TGT(lod, osts->op_array[i]);
1415 if (!ost->ltd_active)
/* blocks-available scaled down by >> 8 to keep the prio
 * multiplication below within 64 bits */
1418 ba = tgt_statfs_bavail(ost) >> 8;
1422 ba_min = min(ba, ba_min);
1423 ba_max = max(ba, ba_max);
1424 ost->ltd_qos.ltq_svr->lsq_bavail += ba;
1427 * per-ost penalty is
1428 * prio * bavail / (num_tgt - 1) / prio_max / 2
1430 ost->ltd_qos.ltq_penalty_per_obj = prio_wide * ba >> 9;
1431 do_div(ost->ltd_qos.ltq_penalty_per_obj, num_active);
/* age since the OST was last picked, in 1/8 s units */
1433 age = (now - ost->ltd_qos.ltq_used) >> 3;
1434 if (age > 32 * desc->ld_qos_maxage)
1435 ost->ltd_qos.ltq_penalty = 0;
1436 else if (age > desc->ld_qos_maxage)
1437 /* Decay ost penalty. */
1438 ost->ltd_qos.ltq_penalty >>= age / desc->ld_qos_maxage;
1442 * If each ost has almost same free space, do rr allocation for better
1443 * creation performance
1445 if ((ba_max * (256 - qos->lq_threshold_rr)) >> 8 < ba_min) {
1446 pool->pool_same_space = true;
1447 pool->pool_same_space_expire = now + desc->ld_qos_maxage;
1449 pool->pool_same_space = false;
1454 if (!rc && pool->pool_same_space)
1461 * Allocate a striping using an algorithm with weights.
1463 * The function allocates OST objects to create a striping. The algorithm
1464 * used is based on weights (currently only using the free space), and it's
1465 * trying to ensure the space is used evenly by OSTs and OSSs. The striping
1466 * configuration (# of stripes, offset, pool) is taken from the object and
1467 * is prepared by the caller.
1469 * If LOD_USES_DEFAULT_STRIPE is not passed and prepared configuration can't
1470 * be met due to too few OSTs, then allocation fails. If the flag is passed
1471 * fewer than 3/4 of the requested number of stripes can be allocated, then
1474 * No concurrent allocation is allowed on the object and this must be ensured
1475 * by the caller. All the internal structures are protected by the function.
1477 * The algorithm has two steps: find available OSTs and calculate their
1478 * weights, then select the OSTs with their weights used as the probability.
1479 * An OST with a higher weight is proportionately more likely to be selected
1480 * than one with a lower weight.
1482 * \param[in] env execution environment for this thread
1483 * \param[in] lo LOD object
1484 * \param[out] stripe striping created
1485 * \param[out] ost_indices ost indices of striping created
1486 * \param[in] flags 0 or LOD_USES_DEFAULT_STRIPE
1487 * \param[in] th transaction handle
1488 * \param[in] comp_idx index of ldo_comp_entries
1490 * \retval 0 on success
1491 * \retval -EAGAIN not enough OSTs are found for specified stripe count
1492 * \retval -EINVAL requested OST index is invalid
1493 * \retval negative errno on failure
1495 static int lod_ost_alloc_qos(const struct lu_env *env, struct lod_object *lo,
1496 struct dt_object **stripe, __u32 *ost_indices,
1497 enum lod_uses_hint flags, struct thandle *th,
1498 int comp_idx, __u64 reserve)
1500 struct lod_layout_component *lod_comp;
1501 struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
1502 struct lod_avoid_guide *lag = &lod_env_info(env)->lti_avoid;
1503 struct lod_tgt_desc *ost;
1504 struct dt_object *o;
1505 __u64 total_weight = 0;
1506 struct pool_desc *pool = NULL;
1507 struct lu_tgt_pool *osts;
1509 __u32 nfound, good_osts, stripe_count, stripe_count_min;
1510 bool overstriped = false;
1511 int stripes_per_ost = 1;
1516 /* Totally skip qos part when qos_threshold_rr=100% */
1517 if (lod->lod_ost_descs.ltd_qos.lq_threshold_rr == QOS_THRESHOLD_MAX)
1520 LASSERT(lo->ldo_comp_cnt > comp_idx && lo->ldo_comp_entries != NULL);
1521 lod_comp = &lo->ldo_comp_entries[comp_idx];
1522 stripe_count = lod_comp->llc_stripe_count;
1523 stripe_count_min = lod_stripe_count_min(stripe_count, flags);
1524 if (stripe_count_min < 1)
/* restrict allocation to the component's named pool, if any */
1527 if (lod_comp->llc_pool != NULL)
1528 pool = lod_find_pool(lod, lod_comp->llc_pool);
1531 down_read(&pool_tgt_rw_sem(pool));
1532 osts = &(pool->pool_obds);
1534 osts = &lod->lod_ost_descs.ltd_tgt_pool;
1537 /* Detect -EAGAIN early, before expensive lock is taken. */
1538 if (!ltd_qos_is_usable(&lod->lod_ost_descs))
1539 GOTO(out_nolock, rc = -EAGAIN);
/* with overstriping the same OST may carry multiple stripes */
1541 if (lod_comp->llc_pattern & LOV_PATTERN_OVERSTRIPING)
1543 (lod_comp->llc_stripe_count - 1)/osts->op_count + 1;
1545 #ifdef HAVE_DOWN_WRITE_KILLABLE
/* Bound the wait for @lq_rw_sem: arm a 2s timer that SIGKILLs this
 * task (see process_semaphore_timer()) so the killable down_write()
 * cannot block indefinitely. */
1546 if (!down_write_trylock(&lod->lod_ost_descs.ltd_qos.lq_rw_sem)) {
1547 struct semaphore_timer timer;
1549 kernel_sigaction(SIGKILL, SIG_DFL);
1550 timer.task = current;
1551 cfs_timer_setup(&timer.timer, process_semaphore_timer, 0, 0);
1552 mod_timer(&timer.timer, jiffies + cfs_time_seconds(2));
1553 /* Do actual allocation, use write lock here. */
1554 rc = down_write_killable(&lod->lod_ost_descs.ltd_qos.lq_rw_sem);
1556 timer_delete_sync(&timer.timer);
1557 kernel_sigaction(SIGKILL, SIG_IGN);
1559 flush_signals(current);
1560 CDEBUG(D_OTHER, "%s: wakeup semaphore on timeout rc = %d\n",
1561 lod2obd(lod)->obd_name, rc);
1562 GOTO(out_nolock, rc = -EAGAIN);
1566 /* Do actual allocation, use write lock here. */
1567 down_write(&lod->lod_ost_descs.ltd_qos.lq_rw_sem);
1570 * Check again, while we were sleeping on @lq_rw_sem things could
1573 if (!ltd_qos_is_usable(&lod->lod_ost_descs))
1574 GOTO(out, rc = -EAGAIN);
/* per-pool penalties when a pool is used, filesystem-wide otherwise */
1577 rc = lod_pool_qos_penalties_calc(lod, pool);
1579 rc = ltd_qos_penalties_calc(&lod->lod_ost_descs);
1583 rc = lod_qos_tgt_in_use_clear(env, lod_comp->llc_stripe_count);
1588 /* Find all the OSTs that are valid stripe candidates */
1589 for (i = 0; i < osts->op_count; i++) {
1590 if (!test_bit(osts->op_array[i], lod->lod_ost_bitmap))
1593 ost = OST_TGT(lod, osts->op_array[i]);
1594 ost->ltd_qos.ltq_usable = 0;
1596 rc = lod_statfs_and_check(env, lod, &lod->lod_ost_descs,
1599 /* this OSP doesn't feel well */
1603 if (ost->ltd_statfs.os_state & OS_STATFS_DEGRADED)
1606 /* Fail Check before osc_precreate() is called
1607 * so we can only 'fail' single OSC.
1609 if (CFS_FAIL_CHECK(OBD_FAIL_MDS_OSC_PRECREATE) &&
1610 osts->op_array[i] == 0)
1613 ost->ltd_qos.ltq_usable = 1;
1614 lu_tgt_qos_weight_calc(ost, false);
1615 total_weight += ost->ltd_qos.ltq_weight;
1620 CDEBUG(D_OTHER, "found %d good osts\n", good_osts);
1622 if (good_osts < stripe_count_min)
1623 GOTO(out, rc = -EAGAIN);
1625 /* If we do not have enough OSTs for the requested stripe count, do not
1626 * put more stripes per OST than requested.
1628 if (stripe_count / stripes_per_ost > good_osts)
1629 stripe_count = good_osts * stripes_per_ost;
1631 /* Find enough OSTs with weighted random allocation. */
1633 while (nfound < stripe_count) {
1634 u64 rand, cur_weight;
1639 rand = lu_prandom_u64_max(total_weight);
1641 /* On average, this will hit larger-weighted OSTs more often.
1642 * 0-weight OSTs will always get used last (only when rand=0)
1644 for (i = 0; i < osts->op_count; i++) {
1645 __u32 idx = osts->op_array[i];
1646 struct lod_tgt_desc *ost;
1648 if (lod_should_avoid_ost(lo, lag, idx))
1651 ost = OST_TGT(lod, idx);
1653 if (!ost->ltd_qos.ltq_usable)
1656 cur_weight += ost->ltd_qos.ltq_weight;
1657 CDEBUG(D_OTHER, "stripe_count=%d nfound=%d cur_weight=%llu rand=%llu total_weight=%llu\n",
1658 stripe_count, nfound, cur_weight, rand,
1661 if (cur_weight < rand)
1664 CDEBUG(D_OTHER, "stripe=%d to idx=%d\n", nfound, idx);
1666 * In case of QOS it makes sense to check components
1667 * only for FLR and if current component doesn't support
1670 if (lo->ldo_mirror_count > 1 &&
1671 !(lod_comp->llc_pattern & LOV_PATTERN_OVERSTRIPING)
1672 && lod_comp_is_ost_used(env, lo, idx))
1675 if (lod_qos_is_tgt_used(env, idx, nfound)) {
1676 if (lod_comp->llc_pattern &
1677 LOV_PATTERN_OVERSTRIPING)
/* reserve (declare) an object on the selected OST */
1683 o = lod_qos_declare_object_on(env, lod, idx, slow, th);
1685 CDEBUG(D_OTHER, "can't declare object on #%u: %d\n",
1686 idx, (int) PTR_ERR(o));
1690 lod_avoid_update(lo, lag);
1691 lod_qos_tgt_in_use(env, nfound, idx);
1693 ost_indices[nfound] = idx;
/* rebalance remaining weights after taking this OST */
1694 ltd_qos_update(&lod->lod_ost_descs, ost, &total_weight);
1700 if (rc && !slow && nfound < stripe_count) {
1701 /* couldn't allocate using precreated objects
1702 * so try to wait for new precreations */
1708 /* no OST found on this iteration, give up */
1713 if (unlikely(nfound < stripe_count_min)) {
1715 * when the decision to use weighted algorithm was made
1716 * we had enough appropriate OSPs, but this state can
1717 * change anytime (no space on OST, broken connection, etc)
1718 * so it's possible OSP won't be able to provide us with
1719 * an object due to just changed state
1721 CDEBUG(D_OTHER, "%s: wanted %d objects, found only %d\n",
1722 lod2obd(lod)->obd_name, stripe_count, nfound);
1723 for (i = 0; i < nfound; i++) {
1724 LASSERT(stripe[i] != NULL);
1725 dt_object_put(env, stripe[i]);
1729 /* makes sense to rebalance next time */
1730 set_bit(LQ_DIRTY, &lod->lod_ost_descs.ltd_qos.lq_flags);
1731 clear_bit(LQ_SAME_SPACE, &lod->lod_ost_descs.ltd_qos.lq_flags);
1735 /* If there are enough OSTs, a component with overstriping requested
1736 * will not actually end up overstriped. The comp should reflect this.
1738 if (rc == 0 && !overstriped)
1739 lod_comp->llc_pattern &= ~LOV_PATTERN_OVERSTRIPING;
1742 up_write(&lod->lod_ost_descs.ltd_qos.lq_rw_sem);
1746 up_read(&pool_tgt_rw_sem(pool));
1747 /* put back ref got by lod_find_pool() */
1748 lod_pool_putref(pool);
1755 * Allocate a striping using an algorithm with weights.
1757 * The function allocates remote MDT objects to create a striping, the first
1758 * object was already allocated on current MDT to ensure master object and
1759 * the first object are on the same MDT. The algorithm used is based on weights
1760 * (both free space and inodes), and it's trying to ensure the space/inodes are
1761 * used evenly by MDTs and MDSs. The striping configuration (# of stripes,
1762 * offset, pool) is taken from the object and is prepared by the caller.
1764 * If prepared configuration can't be met due to too few MDTs, then allocation
1767 * No concurrent allocation is allowed on the object and this must be ensured
1768 * by the caller. All the internal structures are protected by the function.
1770 * The algorithm has two steps: find available MDTs and calculate their
1771 * weights, then select the MDTs with their weights used as the probability.
1772 * An MDT with a higher weight is proportionately more likely to be selected
1773 * than one with a lower weight.
1775 * \param[in] env execution environment for this thread
1776 * \param[in] lo LOD object
1777 * \param[in] stripe_idx starting stripe index to allocate, if it's not
1778 * 0, we are restriping directory
1779 * \param[in] stripe_count total stripe count
1780 * \param[out] stripes striping created
1782 * \retval positive stripes allocated, and it should be equal to
1783 * lo->ldo_dir_stripe_count
1784 * \retval -EAGAIN not enough tgts are found for specified stripe count
1785 * \retval -EINVAL requested MDT index is invalid
1786 * \retval negative errno on failure
1788 int lod_mdt_alloc_qos(const struct lu_env *env, struct lod_object *lo,
1789 struct dt_object **stripes, u32 stripe_idx,
1792 struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
1793 struct lu_tgt_descs *ltd = &lod->lod_mdt_descs;
1794 struct lu_object_conf conf = { .loc_flags = LOC_F_NEW };
1795 struct lu_fid fid = { 0 };
1796 const struct lu_tgt_pool *pool;
1797 struct lu_tgt_desc *mdt;
1798 struct dt_object *dto;
1799 u64 total_weight = 0;
/* remember where we started so we can release only our stripes on error */
1800 u32 saved_idx = stripe_idx;
1802 unsigned int good_mdts;
1808 /* Totally skip qos part when qos_threshold_rr=100% */
1809 if (ltd->ltd_qos.lq_threshold_rr == QOS_THRESHOLD_MAX)
1812 LASSERT(stripe_idx <= stripe_count);
1813 if (stripe_idx == stripe_count)
1814 RETURN(stripe_count);
1816 /* use MDT pool in @ltd, once MDT pool is supported in the future, it
1817 * can be passed in as argument like OST object allocation.
1819 pool = &ltd->ltd_tgt_pool;
1821 /* Detect -EAGAIN early, before expensive lock is taken. */
1822 if (!ltd_qos_is_usable(ltd))
1825 rc = lod_qos_mdt_in_use_init(env, ltd, stripe_idx, stripe_count, pool,
1830 /* Do actual allocation, use write lock here. */
1831 down_write(&ltd->ltd_qos.lq_rw_sem);
1834 * Check again, while we were sleeping on @lq_rw_sem things could
1837 if (!ltd_qos_is_usable(ltd))
1838 GOTO(unlock, rc = -EAGAIN);
1840 rc = ltd_qos_penalties_calc(ltd);
1845 /* Find all the MDTs that are valid stripe candidates */
1846 for (i = 0; i < pool->op_count; i++) {
1847 if (!test_bit(pool->op_array[i], ltd->ltd_tgt_bitmap))
1850 mdt = LTD_TGT(ltd, pool->op_array[i]);
1851 mdt->ltd_qos.ltq_usable = 0;
/* skip disconnected MDTs and those failing the statfs checks */
1853 if (mdt->ltd_discon || lod_statfs_check(ltd, mdt))
1856 if (mdt->ltd_statfs.os_state & OS_STATFS_DEGRADED)
1859 mdt->ltd_qos.ltq_usable = 1;
1860 lu_tgt_qos_weight_calc(mdt, true);
1861 total_weight += mdt->ltd_qos.ltq_weight;
1866 CDEBUG(D_OTHER, "found %d good MDTs\n", good_mdts);
1868 if (good_mdts < stripe_count - stripe_idx)
1869 GOTO(unlock, rc = -EAGAIN);
1871 /* Find enough MDTs with weighted random allocation. */
1872 while (stripe_idx < stripe_count) {
1873 u64 rand, cur_weight;
1878 rand = lu_prandom_u64_max(total_weight);
1880 /* On average, this will hit larger-weighted MDTs more often.
1881 * 0-weight MDT will always get used last (only when rand=0) */
1882 for (i = 0; i < pool->op_count; i++) {
1885 mdt_idx = pool->op_array[i];
1886 mdt = LTD_TGT(ltd, mdt_idx);
1888 if (!mdt->ltd_qos.ltq_usable)
1891 cur_weight += mdt->ltd_qos.ltq_weight;
1893 CDEBUG(D_OTHER, "stripe_count=%d stripe_index=%d cur_weight=%llu rand=%llu total_weight=%llu\n",
1894 stripe_count, stripe_idx, cur_weight, rand,
1897 if (cur_weight < rand)
1900 CDEBUG(D_OTHER, "stripe=%d to idx=%d\n",
1901 stripe_idx, mdt_idx);
1903 if (lod_qos_is_tgt_used(env, mdt_idx, stripe_idx))
/* allocate a FID on the chosen MDT, then instantiate the
 * stripe object there */
1906 rc2 = dt_fid_alloc(env, mdt->ltd_tgt, &fid, NULL, NULL);
1908 CDEBUG(D_OTHER, "can't alloc FID on #%u: %d\n",
1913 conf.loc_flags = LOC_F_NEW;
1914 dto = dt_locate_at(env, mdt->ltd_tgt, &fid,
1915 lo->ldo_obj.do_lu.lo_dev->ld_site->ls_top_dev,
1918 CDEBUG(D_OTHER, "can't alloc stripe on #%u: %d\n",
1919 mdt_idx, (int) PTR_ERR(dto));
1923 lod_qos_tgt_in_use(env, stripe_idx, mdt_idx);
1924 stripes[stripe_idx] = dto;
/* rebalance remaining weights after taking this MDT */
1925 ltd_qos_update(ltd, mdt, &total_weight);
1931 /* no MDT found on this iteration, give up */
1936 if (unlikely(stripe_idx != stripe_count)) {
1938 * when the decision to use weighted algorithm was made
1939 * we had enough appropriate OSPs, but this state can
1940 * change anytime (no space on MDT, broken connection, etc)
1941 * so it's possible OSP won't be able to provide us with
1942 * an object due to just changed state
1944 CDEBUG(D_OTHER, "%s: wanted %d objects, found only %d\n",
1945 lod2obd(lod)->obd_name, stripe_count, stripe_idx);
1946 for (i = saved_idx; i < stripe_idx; i++) {
1947 LASSERT(stripes[i] != NULL);
1948 dt_object_put(env, stripes[i]);
1952 /* makes sense to rebalance next time */
1953 set_bit(LQ_DIRTY, &ltd->ltd_qos.lq_flags);
1954 clear_bit(LQ_SAME_SPACE, &ltd->ltd_qos.lq_flags);
1962 up_write(&ltd->ltd_qos.lq_rw_sem);
1968 * Check stripe count the caller can use.
1970 * For new layouts (no initialized components), check the total size of the
1971 * layout against the maximum EA size from the backing file system. This
1972 * stops us from creating a layout which will be too large once initialized.
1974 * For existing layouts (with initialized components):
1975 * Find the maximal possible stripe count not greater than \a stripe_count.
1976 * If the provided stripe count is 0, then the filesystem's default is used.
1978 * \param[in] lod LOD device
1979 * \param[in] lo The lod_object
1980 * \param[in] comp_idx The component id, which the amount of stripes is
1982 * \param[in] stripe_count count the caller would like to use
1984 * \retval the maximum usable stripe count
1986 __u16 lod_get_stripe_count_plain(struct lod_device *lod, struct lod_object *lo,
1987 __u16 stripe_count, bool overstriping,
1988 enum lod_uses_hint *flags)
1990 struct lov_desc *lov_desc = &lod->lod_ost_descs.ltd_lov_desc;
/* fall back to the filesystem-wide default stripe count */
1993 stripe_count = lov_desc->ld_default_stripe_count;
1995 /* Overstriping allows more stripes than targets */
1996 if (stripe_count > lov_desc->ld_active_tgt_count && !overstriping) {
1997 *flags |= LOD_USES_DEFAULT_STRIPE;
/* "stripe across all" is capped by the configured maximum, if set */
1998 if (stripe_count == LOV_ALL_STRIPES && lod->lod_max_stripecount)
1999 stripe_count = lod->lod_max_stripecount;
2001 stripe_count = lov_desc->ld_active_tgt_count;
/* even with overstriping, never exceed the protocol-wide maximum */
2006 if (overstriping && stripe_count > LOV_MAX_STRIPE_COUNT)
2007 stripe_count = LOV_MAX_STRIPE_COUNT;
2009 return stripe_count;
2012 __u16 lod_get_stripe_count(struct lod_device *lod, struct lod_object *lo,
2013 int comp_idx, __u16 stripe_count, bool overstriping,
2014 enum lod_uses_hint *flags)
2016 __u32 max_stripes = LOV_MAX_STRIPE_COUNT_OLD;
2017 /* max stripe count is based on OSD ea size */
2018 unsigned int easize = lod->lod_osd_max_easize;
2023 stripe_count = lod_get_stripe_count_plain(lod, lo, stripe_count,
2024 overstriping, flags);
/* For composite layouts, subtract the space consumed by the layout
 * header and the other components from the EA budget before working
 * out how many stripes this component may have. */
2026 if (lo->ldo_is_composite) {
2027 struct lod_layout_component *lod_comp;
2028 unsigned int header_sz = sizeof(struct lov_comp_md_v1);
2029 unsigned int init_comp_sz = 0;
2030 unsigned int total_comp_sz = 0;
2031 unsigned int comp_sz;
2033 header_sz += sizeof(struct lov_comp_md_entry_v1) *
2036 for (i = 0; i < lo->ldo_comp_cnt; i++) {
2037 unsigned int stripes;
2042 lod_comp = &lo->ldo_comp_entries[i];
2043 /* Extension comp is never inited - 0 stripes on disk */
2044 stripes = lod_comp->llc_flags & LCME_FL_EXTENSION ? 0 :
2045 lod_comp->llc_stripe_count;
2047 comp_sz = lov_mds_md_size(stripes, LOV_MAGIC_V3);
2048 total_comp_sz += comp_sz;
2049 if (lod_comp->llc_flags & LCME_FL_INIT)
2050 init_comp_sz += comp_sz;
/* if any component is already initialized, budget against the
 * initialized set only */
2053 if (init_comp_sz > 0)
2054 total_comp_sz = init_comp_sz;
2056 header_sz += total_comp_sz;
2058 if (easize > header_sz)
2059 easize -= header_sz;
2064 max_stripes = lov_mds_md_max_stripe_count(easize, LOV_MAGIC_V3);
2065 max_stripes = (max_stripes == 0) ? 0 : max_stripes - 1;
2067 stripe_count = min_t(__u16, stripe_count, max_stripes);
2068 RETURN(stripe_count);
2072 * Create in-core representation for a fully-defined striping
2074 * When the caller passes a fully-defined striping (i.e. everything including
2075 * OST object FIDs are defined), then we still need to instantiate LU-cache
2076 * with the objects representing the stripes defined. This function completes
2079 * \param[in] env execution environment for this thread
2080 * \param[in] mo LOD object
2081 * \param[in] buf buffer containing the striping
2083 * \retval 0 on success
2084 * \retval negative negated errno on error
2086 int lod_use_defined_striping(const struct lu_env *env,
2087 struct lod_object *mo,
2088 const struct lu_buf *buf)
2090 struct lod_layout_component *lod_comp;
2091 struct lov_mds_md_v1 *v1 = buf->lb_buf;
2092 struct lov_mds_md_v3 *v3 = buf->lb_buf;
2093 struct lov_comp_md_v1 *comp_v1 = NULL;
2094 struct lov_ost_data_v1 *objs;
2101 mutex_lock(&mo->ldo_layout_mutex);
/* drop any previously cached striping before installing the new one */
2102 lod_striping_free_nolock(env, mo);
/* strip the "defined" flag to obtain the base layout magic */
2104 magic = le32_to_cpu(v1->lmm_magic) & ~LOV_MAGIC_DEFINED;
2106 if (magic != LOV_MAGIC_V1 && magic != LOV_MAGIC_V3 &&
2107 magic != LOV_MAGIC_COMP_V1 && magic != LOV_MAGIC_FOREIGN)
2108 GOTO(unlock, rc = -EINVAL);
2110 if (magic == LOV_MAGIC_COMP_V1) {
2111 comp_v1 = buf->lb_buf;
2112 comp_cnt = le16_to_cpu(comp_v1->lcm_entry_count);
2114 GOTO(unlock, rc = -EINVAL);
2115 mirror_cnt = le16_to_cpu(comp_v1->lcm_mirror_count) + 1;
2116 mo->ldo_flr_state = le16_to_cpu(comp_v1->lcm_flags) &
2118 mo->ldo_is_composite = 1;
2119 } else if (magic == LOV_MAGIC_FOREIGN) {
2120 struct lov_foreign_md *foreign;
/* validate the buffer is large enough for the foreign header,
 * then for the full foreign EA it describes */
2123 if (buf->lb_len < offsetof(typeof(*foreign), lfm_value)) {
2125 "buf len %zu < min lov_foreign_md size (%zu)\n",
2127 offsetof(typeof(*foreign), lfm_value));
2128 GOTO(out, rc = -EINVAL);
2130 foreign = (struct lov_foreign_md *)buf->lb_buf;
2131 length = foreign_size_le(foreign);
2132 if (buf->lb_len < length) {
2134 "buf len %zu < this lov_foreign_md size (%zu)\n",
2135 buf->lb_len, length);
2136 GOTO(out, rc = -EINVAL);
2139 /* just cache foreign LOV EA raw */
2140 rc = lod_alloc_foreign_lov(mo, length);
2143 memcpy(mo->ldo_foreign_lov, buf->lb_buf, length);
2146 mo->ldo_is_composite = 0;
2150 mo->ldo_layout_gen = le16_to_cpu(v1->lmm_layout_gen);
2152 rc = lod_alloc_comp_entries(mo, mirror_cnt, comp_cnt);
/* decode each component (a plain layout is treated as one component) */
2156 for (i = 0; i < comp_cnt; i++) {
2157 struct lu_extent *ext;
2161 lod_comp = &mo->ldo_comp_entries[i];
2163 if (mo->ldo_is_composite) {
2164 offs = le32_to_cpu(comp_v1->lcm_entries[i].lcme_offset);
2165 v1 = (struct lov_mds_md_v1 *)((char *)comp_v1 + offs);
2166 v3 = (struct lov_mds_md_v3 *)v1;
2167 magic = le32_to_cpu(v1->lmm_magic);
2169 ext = &comp_v1->lcm_entries[i].lcme_extent;
2170 lod_comp->llc_extent.e_start =
2171 le64_to_cpu(ext->e_start);
2172 lod_comp->llc_extent.e_end = le64_to_cpu(ext->e_end);
2173 lod_comp->llc_flags =
2174 le32_to_cpu(comp_v1->lcm_entries[i].lcme_flags);
2175 if (lod_comp->llc_flags & LCME_FL_NOSYNC)
2176 lod_comp->llc_timestamp = le64_to_cpu(
2177 comp_v1->lcm_entries[i].lcme_timestamp);
2179 le32_to_cpu(comp_v1->lcm_entries[i].lcme_id);
2180 if (lod_comp->llc_id == LCME_ID_INVAL)
2181 GOTO(out, rc = -EINVAL);
2185 if (magic == LOV_MAGIC_V1) {
2186 objs = &v1->lmm_objects[0];
2187 } else if (magic == LOV_MAGIC_V3) {
/* only the V3 format carries a pool name */
2188 objs = &v3->lmm_objects[0];
2189 if (v3->lmm_pool_name[0] != '\0')
2190 pool_name = v3->lmm_pool_name;
2192 CDEBUG(D_LAYOUT, "Invalid magic %x\n", magic);
2193 GOTO(out, rc = -EINVAL);
2196 lod_comp->llc_pattern = le32_to_cpu(v1->lmm_pattern);
2197 lod_comp->llc_stripe_size = le32_to_cpu(v1->lmm_stripe_size);
2198 lod_comp->llc_stripe_count = le16_to_cpu(v1->lmm_stripe_count);
2200 * limit stripe count so that it's less than/equal to
2201 * extent_size / stripe_size.
2203 * Note: extension size reused llc_stripe_size field and
2204 * uninstantiated component could be defined with
2205 * extent_start == extent_end as extension component will
2208 if (mo->ldo_is_composite &&
2209 !(lod_comp->llc_flags & LCME_FL_EXTENSION) &&
2210 (lod_comp_inited(lod_comp) ||
2211 lod_comp->llc_extent.e_start <
2212 lod_comp->llc_extent.e_end) &&
2213 lod_comp->llc_stripe_count != (__u16)-1 &&
2214 lod_comp->llc_extent.e_end != (__u64)-1 &&
2215 (__u64)lod_comp->llc_stripe_count *
2216 lod_comp->llc_stripe_size >
2217 (lod_comp->llc_extent.e_end - lod_comp->llc_extent.e_start))
2218 lod_comp->llc_stripe_count =
2219 DIV_ROUND_UP(lod_comp->llc_extent.e_end -
2220 lod_comp->llc_extent.e_start,
2221 lod_comp->llc_stripe_size);
2222 lod_comp->llc_layout_gen = le16_to_cpu(v1->lmm_layout_gen);
2224 * The stripe_offset of an uninit-ed component is stored in
2225 * the lmm_layout_gen
2227 if (mo->ldo_is_composite && !lod_comp_inited(lod_comp))
2228 lod_comp->llc_stripe_offset = lod_comp->llc_layout_gen;
2229 lod_obj_set_pool(mo, i, pool_name);
/* instantiate the per-stripe objects, except for released and
 * DoM (MDT-resident) components which carry no OST objects */
2231 if ((!mo->ldo_is_composite || lod_comp_inited(lod_comp)) &&
2232 !(lod_comp->llc_pattern & LOV_PATTERN_F_RELEASED) &&
2233 !(lod_comp->llc_pattern & LOV_PATTERN_MDT)) {
2234 rc = lod_initialize_objects(env, mo, objs, i);
2240 rc = lod_fill_mirrors(mo);
2244 lod_striping_free_nolock(env, mo);
2246 mutex_unlock(&mo->ldo_layout_mutex);
/*
 * Validate and attach an OST pool to component @pos.
 *
 * Checks that the requested stripe offset (or, for
 * LOV_USER_MAGIC_SPECIFIC layouts, every requested OST index) belongs
 * to @pool_name; when an index falls outside the pool, the pool is
 * dropped from the component. Otherwise the component's stripe count
 * is clamped to the pool size (unless overstriping) and the pool name
 * is recorded on the component.
 */
2251 static void lod_qos_set_pool(struct lod_object *lo, int pos, char *pool_name,
2252 struct lov_user_md_v1 *v1)
2254 struct lod_device *d = lu2lod_dev(lod2lu_obj(lo)->lo_dev);
2255 struct lod_layout_component *lod_comp;
2256 struct pool_desc *pool = NULL;
2260 /* In the function below, .hs_keycmp resolves to
2261 * pool_hashkey_keycmp() */
2262 /* coverity[overrun-buffer-val] */
2264 pool = lod_find_pool(d, pool_name);
2269 lod_comp = &lo->ldo_comp_entries[pos];
/* a default offset needs no pool-membership check */
2270 if (lod_comp->llc_stripe_offset == LOV_OFFSET_DEFAULT)
2271 goto out_checkcount;
2273 if (v1->lmm_magic == LOV_USER_MAGIC_SPECIFIC) {
2274 struct lov_user_md_v3 *v3;
/* every explicitly requested OST index must be in the pool */
2276 v3 = (struct lov_user_md_v3 *)v1;
2277 for (j = 0; j < v3->lmm_stripe_count; j++) {
2278 idx = lod_comp->llc_ostlist.op_array[j];
2279 rc = lod_check_index_in_pool(idx, pool);
2284 idx = lod_comp->llc_stripe_offset;
2285 rc = lod_check_index_in_pool(idx, pool);
2289 goto out_checkcount;
2292 "%s: index %u is not in the pool %s, dropping the pool\n",
2293 lod2obd(d)->obd_name, idx, pool_name);
/* without overstriping, cannot ask for more stripes than pool members */
2298 if (lod_comp->llc_stripe_count > pool_tgt_count(pool) &&
2299 !(lod_comp->llc_pattern & LOV_PATTERN_OVERSTRIPING))
2300 lod_comp->llc_stripe_count = pool_tgt_count(pool);
2302 lod_pool_putref(pool);
2304 lod_obj_set_pool(lo, pos, pool_name);
2309 * Parse suggested striping configuration.
2311 * The caller gets a suggested striping configuration from a number of sources
2312 * including per-directory default and applications. Then it needs to verify
2313 * the suggested striping is valid, apply missing bits and store the resulting
2314 * configuration in the object to be used by the allocator later. Must not be
2315 * called concurrently against the same object. It's OK to provide a
2316 * fully-defined striping.
2318 * \param[in] env execution environment for this thread
2319 * \param[in] lo LOD object
2320 * \param[in] buf buffer containing the striping
2322 * \retval 0 on success
2323 * \retval negative negated errno on error
2325 int lod_qos_parse_config(const struct lu_env *env, struct lod_object *lo,
2326 const struct lu_buf *buf)
2328 struct lod_layout_component *lod_comp;
2329 struct lod_device *d = lu2lod_dev(lod2lu_obj(lo)->lo_dev);
2330 struct lov_desc *desc = &d->lod_ost_descs.ltd_lov_desc;
2331 struct lov_user_md_v1 *v1 = NULL;
2332 struct lov_user_md_v3 *v3 = NULL;
2333 struct lov_comp_md_v1 *comp_v1 = NULL;
2334 struct lov_foreign_md *lfm = NULL;
2335 char def_pool[LOV_MAXPOOLNAME + 1];
2342 if (buf == NULL || buf->lb_buf == NULL || buf->lb_len == 0)
2345 memset(def_pool, 0, sizeof(def_pool));
2346 if (lo->ldo_comp_entries != NULL)
2347 lod_layout_get_pool(lo->ldo_comp_entries, lo->ldo_comp_cnt,
2348 def_pool, sizeof(def_pool));
2350 /* free default striping info */
2351 if (lo->ldo_is_foreign)
2352 lod_free_foreign_lov(lo);
2354 lod_free_comp_entries(lo);
2356 rc = lod_verify_striping(env, d, lo, buf, false);
2362 comp_v1 = buf->lb_buf;
2363 /* {lmm,lfm}_magic position/length work for all LOV formats */
2364 magic = v1->lmm_magic;
2366 if (unlikely(le32_to_cpu(magic) & LOV_MAGIC_DEFINED)) {
2367 /* try to use as fully defined striping */
2368 rc = lod_use_defined_striping(env, lo, buf);
2373 case __swab32(LOV_USER_MAGIC_V1):
2374 lustre_swab_lov_user_md_v1(v1);
2375 magic = v1->lmm_magic;
2377 case LOV_USER_MAGIC_V1:
2379 case __swab32(LOV_USER_MAGIC_V3):
2380 lustre_swab_lov_user_md_v3(v3);
2381 magic = v3->lmm_magic;
2383 case LOV_USER_MAGIC_V3:
2385 case __swab32(LOV_USER_MAGIC_SPECIFIC):
2386 lustre_swab_lov_user_md_v3(v3);
2387 lustre_swab_lov_user_md_objects(v3->lmm_objects,
2388 v3->lmm_stripe_count);
2389 magic = v3->lmm_magic;
2391 case LOV_USER_MAGIC_SPECIFIC:
2393 case __swab32(LOV_USER_MAGIC_COMP_V1):
2394 lustre_swab_lov_comp_md_v1(comp_v1);
2395 magic = comp_v1->lcm_magic;
2397 case LOV_USER_MAGIC_COMP_V1:
2399 case __swab32(LOV_USER_MAGIC_FOREIGN):
2401 __swab32s(&lfm->lfm_magic);
2402 __swab32s(&lfm->lfm_length);
2403 __swab32s(&lfm->lfm_type);
2404 __swab32s(&lfm->lfm_flags);
2405 magic = lfm->lfm_magic;
2407 case LOV_USER_MAGIC_FOREIGN:
2410 rc = lod_alloc_foreign_lov(lo, foreign_size(lfm));
2413 memcpy(lo->ldo_foreign_lov, buf->lb_buf, foreign_size(lfm));
2416 CERROR("%s: unrecognized magic %X\n",
2417 lod2obd(d)->obd_name, magic);
2421 lustre_print_user_md(D_OTHER, v1, "parse config");
2423 if (magic == LOV_USER_MAGIC_COMP_V1) {
2424 comp_cnt = comp_v1->lcm_entry_count;
2427 mirror_cnt = comp_v1->lcm_mirror_count + 1;
2429 lo->ldo_flr_state = LCM_FL_RDONLY;
2430 lo->ldo_is_composite = 1;
2434 lo->ldo_is_composite = 0;
2437 rc = lod_alloc_comp_entries(lo, mirror_cnt, comp_cnt);
2441 LASSERT(lo->ldo_comp_entries);
2443 for (i = 0; i < comp_cnt; i++) {
2444 struct lu_extent *ext;
2447 lod_comp = &lo->ldo_comp_entries[i];
2449 if (lo->ldo_is_composite) {
2450 v1 = (struct lov_user_md *)((char *)comp_v1 +
2451 comp_v1->lcm_entries[i].lcme_offset);
2452 ext = &comp_v1->lcm_entries[i].lcme_extent;
2453 lod_comp->llc_extent = *ext;
2454 lod_comp->llc_flags =
2455 comp_v1->lcm_entries[i].lcme_flags &
2460 if (def_pool[0] != '\0')
2461 pool_name = def_pool;
2463 if (v1->lmm_magic == LOV_USER_MAGIC_V3 ||
2464 v1->lmm_magic == LOV_USER_MAGIC_SPECIFIC) {
2465 v3 = (struct lov_user_md_v3 *)v1;
2467 if (lov_pool_is_ignored(v3->lmm_pool_name))
2469 else if (v3->lmm_pool_name[0] != '\0' &&
2470 !lov_pool_is_inherited(v3->lmm_pool_name))
2471 pool_name = v3->lmm_pool_name;
2473 if (v3->lmm_magic == LOV_USER_MAGIC_SPECIFIC) {
2474 rc = lod_comp_copy_ost_lists(lod_comp, v3);
2476 GOTO(free_comp, rc);
2480 if (v1->lmm_pattern == 0)
2481 v1->lmm_pattern = LOV_PATTERN_RAID0;
2482 if (lov_pattern(v1->lmm_pattern) != LOV_PATTERN_RAID0 &&
2483 lov_pattern(v1->lmm_pattern) != LOV_PATTERN_MDT &&
2484 lov_pattern(v1->lmm_pattern) !=
2485 (LOV_PATTERN_RAID0 | LOV_PATTERN_OVERSTRIPING)) {
2486 CDEBUG(D_LAYOUT, "%s: invalid pattern: %x\n",
2487 lod2obd(d)->obd_name, v1->lmm_pattern);
2488 GOTO(free_comp, rc = -EINVAL);
2491 lod_comp->llc_pattern = v1->lmm_pattern;
2492 lod_comp->llc_stripe_size = v1->lmm_stripe_size;
2493 lod_adjust_stripe_size(lod_comp, desc->ld_default_stripe_size);
2495 lod_comp->llc_stripe_count = desc->ld_default_stripe_count;
2496 if (v1->lmm_stripe_count ||
2497 lov_pattern(v1->lmm_pattern) == LOV_PATTERN_MDT)
2498 lod_comp->llc_stripe_count = v1->lmm_stripe_count;
2500 if (lov_pattern(lod_comp->llc_pattern) == LOV_PATTERN_MDT &&
2501 lod_comp->llc_stripe_count != 0) {
2502 CDEBUG(D_LAYOUT, "%s: invalid stripe count: %u\n",
2503 lod2obd(d)->obd_name,
2504 lod_comp->llc_stripe_count);
2505 GOTO(free_comp, rc = -EINVAL);
2508 * limit stripe count so that it's less than/equal to
2509 * extent_size / stripe_size.
2511 * Note: extension size reused llc_stripe_size field and
2512 * uninstantiated component could be defined with
2513 * extent_start == extent_end as extension component will
2516 if (lo->ldo_is_composite &&
2517 !(lod_comp->llc_flags & LCME_FL_EXTENSION) &&
2518 lod_comp->llc_stripe_count != (__u16)-1 &&
2519 (lod_comp_inited(lod_comp) ||
2520 lod_comp->llc_extent.e_start <
2521 lod_comp->llc_extent.e_end) &&
2522 lod_comp->llc_extent.e_end != (__u64)-1 &&
2523 lod_comp->llc_stripe_count * lod_comp->llc_stripe_size >
2524 (lod_comp->llc_extent.e_end - lod_comp->llc_extent.e_start))
2525 lod_comp->llc_stripe_count =
2526 DIV_ROUND_UP(lod_comp->llc_extent.e_end -
2527 lod_comp->llc_extent.e_start,
2528 lod_comp->llc_stripe_size);
2530 lod_comp->llc_stripe_offset = v1->lmm_stripe_offset;
2531 lod_qos_set_pool(lo, i, pool_name, v1);
2537 lod_free_comp_entries(lo);
2542 * prepare enough OST avoidance bitmap space
2544 int lod_prepare_avoidance(const struct lu_env *env, struct lod_object *lo)
2546 struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
2547 struct lod_avoid_guide *lag = &lod_env_info(env)->lti_avoid;
2548 unsigned long *bitmap = NULL;
2549 __u32 *new_oss = NULL;
2551 lag->lag_ost_avail = lod->lod_ost_count;
2553 /* reset OSS avoid guide array */
2554 lag->lag_oaa_count = 0;
2555 if (lag->lag_oss_avoid_array &&
2556 lag->lag_oaa_size < lod->lod_ost_count) {
2557 OBD_FREE_PTR_ARRAY(lag->lag_oss_avoid_array, lag->lag_oaa_size);
2558 lag->lag_oss_avoid_array = NULL;
2559 lag->lag_oaa_size = 0;
2562 /* init OST avoid guide bitmap */
2563 if (lag->lag_ost_avoid_bitmap) {
2564 if (lod->lod_ost_count <= lag->lag_ost_avoid_size) {
2565 bitmap_zero(lag->lag_ost_avoid_bitmap,
2566 lag->lag_ost_avoid_size);
2568 bitmap_free(lag->lag_ost_avoid_bitmap);
2569 lag->lag_ost_avoid_bitmap = NULL;
2573 if (!lag->lag_ost_avoid_bitmap) {
2574 bitmap = bitmap_zalloc(lod->lod_ost_count, GFP_KERNEL);
2579 if (!lag->lag_oss_avoid_array) {
2581 * usually there are multiple OSTs in one OSS, but we don't
2582 * know the exact OSS number, so we choose a safe option,
2583 * using OST count to allocate the array to store the OSS
2586 OBD_ALLOC_PTR_ARRAY(new_oss, lod->lod_ost_count);
2588 bitmap_free(bitmap);
2594 lag->lag_oss_avoid_array = new_oss;
2595 lag->lag_oaa_size = lod->lod_ost_count;
2598 lag->lag_ost_avoid_bitmap = bitmap;
2599 lag->lag_ost_avoid_size = lod->lod_ost_count;
2606 * Collect information of used OSTs and OSSs in the overlapped components
2609 void lod_collect_avoidance(struct lod_object *lo, struct lod_avoid_guide *lag,
2612 struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
2613 struct lod_layout_component *lod_comp = &lo->ldo_comp_entries[comp_idx];
2614 unsigned long *bitmap = lag->lag_ost_avoid_bitmap;
2617 /* iterate components */
2618 for (i = 0; i < lo->ldo_comp_cnt; i++) {
2619 struct lod_layout_component *comp;
2622 * skip mirror containing component[comp_idx], we only
2623 * collect OSTs info of conflicting component in other mirrors,
2624 * so that during read, if OSTs of a mirror's component are
2625 * not available, we still have other mirror with different
2626 * OSTs to read the data.
2628 comp = &lo->ldo_comp_entries[i];
2629 if (comp->llc_id != LCME_ID_INVAL &&
2630 mirror_id_of(comp->llc_id) ==
2631 mirror_id_of(lod_comp->llc_id))
2635 * skip non-overlapped or un-instantiated components,
2636 * NOTE: don't use lod_comp_inited(comp) to judge
2637 * whether @comp has been inited, since during
2638 * declare phase, comp->llc_stripe has been allocated
2639 * while it's init flag not been set until the exec
2642 if (!lu_extent_is_overlapped(&comp->llc_extent,
2643 &lod_comp->llc_extent) ||
2648 * collect used OSTs index and OSS info from a
2651 for (j = 0; j < comp->llc_stripe_count; j++) {
2652 struct lod_tgt_desc *ost;
2653 struct lu_svr_qos *lsq;
2656 ost = OST_TGT(lod, comp->llc_ost_indices[j]);
2657 lsq = ost->ltd_qos.ltq_svr;
2659 if (test_bit(ost->ltd_index, bitmap))
2662 CDEBUG(D_OTHER, "OST%d used in conflicting mirror component\n", ost->ltd_index);
2663 set_bit(ost->ltd_index, bitmap);
2664 lag->lag_ost_avail--;
2666 for (k = 0; k < lag->lag_oaa_count; k++) {
2667 if (lag->lag_oss_avoid_array[k] ==
2671 if (k == lag->lag_oaa_count) {
2672 lag->lag_oss_avoid_array[k] =
2674 lag->lag_oaa_count++;
2681 * Create a striping for an obejct.
2683 * The function creates a new striping for the object. The function tries QoS
2684 * algorithm first unless free space is distributed evenly among OSTs, but
2685 * by default RR algorithm is preferred due to internal concurrency (QoS is
2686 * serialized). The caller must ensure no concurrent calls to the function
2687 * are made against the same object.
2689 * \param[in] env execution environment for this thread
2690 * \param[in] lo LOD object
2691 * \param[in] attr attributes OST objects will be declared with
2692 * \param[in] th transaction handle
2693 * \param[in] comp_idx index of ldo_comp_entries
2695 * \retval 0 on success
2696 * \retval negative negated errno on error
2698 int lod_qos_prep_create(const struct lu_env *env, struct lod_object *lo,
2699 struct lu_attr *attr, struct thandle *th,
2700 int comp_idx, __u64 reserve)
2702 struct lod_layout_component *lod_comp;
2703 struct lod_device *d = lu2lod_dev(lod2lu_obj(lo)->lo_dev);
2704 struct lod_avoid_guide *lag = &lod_env_info(env)->lti_avoid;
2705 struct dt_object **stripe = NULL;
2706 __u32 *ost_indices = NULL;
2707 enum lod_uses_hint flags = LOD_USES_ASSIGNED_STRIPE;
2713 LASSERT(lo->ldo_comp_cnt > comp_idx && lo->ldo_comp_entries != NULL);
2714 lod_comp = &lo->ldo_comp_entries[comp_idx];
2715 LASSERT(!(lod_comp->llc_flags & LCME_FL_EXTENSION));
2717 /* A released component is being created */
2718 if (lod_comp->llc_pattern & LOV_PATTERN_F_RELEASED)
2721 /* A Data-on-MDT component is being created */
2722 if (lov_pattern(lod_comp->llc_pattern) == LOV_PATTERN_MDT)
2725 if (lod_comp->llc_pool)
2726 lod_check_and_spill_pool(env, d, &lod_comp->llc_pool);
2728 if (likely(lod_comp->llc_stripe == NULL)) {
2730 * no striping has been created so far
2732 LASSERT(lod_comp->llc_stripe_count);
2734 * statfs and check OST targets now, since ld_active_tgt_count
2735 * could be changed if some OSTs are [de]activated manually.
2737 lod_qos_statfs_update(env, d, &d->lod_ost_descs);
2738 stripe_len = lod_get_stripe_count(d, lo, comp_idx,
2739 lod_comp->llc_stripe_count,
2740 lod_comp->llc_pattern &
2741 LOV_PATTERN_OVERSTRIPING,
2744 if (stripe_len == 0)
2745 GOTO(out, rc = -ERANGE);
2746 lod_comp->llc_stripe_count = stripe_len;
2747 OBD_ALLOC_PTR_ARRAY(stripe, stripe_len);
2749 GOTO(out, rc = -ENOMEM);
2750 OBD_ALLOC_PTR_ARRAY(ost_indices, stripe_len);
2752 GOTO(out, rc = -ENOMEM);
2755 lod_getref(&d->lod_ost_descs);
2756 /* XXX: support for non-0 files w/o objects */
2757 CDEBUG(D_OTHER, "tgt_count %d stripe_count %d\n",
2758 d->lod_ost_count, stripe_len);
2760 if (lod_comp->llc_ostlist.op_array &&
2761 lod_comp->llc_ostlist.op_count) {
2762 rc = lod_alloc_ost_list(env, lo, stripe, ost_indices,
2763 th, comp_idx, reserve);
2764 } else if (lod_comp->llc_stripe_offset == LOV_OFFSET_DEFAULT) {
2766 * collect OSTs and OSSs used in other mirrors whose
2767 * components cross the ldo_comp_entries[comp_idx]
2769 rc = lod_prepare_avoidance(env, lo);
2773 CDEBUG(D_OTHER, "collecting conflict osts for comp[%d]\n",
2775 lod_collect_avoidance(lo, lag, comp_idx);
2777 rc = lod_ost_alloc_qos(env, lo, stripe, ost_indices,
2778 flags, th, comp_idx, reserve);
2780 rc = lod_ost_alloc_rr(env, lo, stripe,
2781 ost_indices, flags, th,
2784 rc = lod_ost_alloc_specific(env, lo, stripe,
2785 ost_indices, flags, th,
2789 lod_putref(d, &d->lod_ost_descs);
2791 for (i = 0; i < stripe_len; i++)
2792 if (stripe[i] != NULL)
2793 dt_object_put(env, stripe[i]);
2795 /* In case there is no space on any OST, let's ignore
2796 * the @reserve space to avoid an error at the init
2797 * time, probably the actual IO will be less than the
2798 * given @reserve space (aka extension_size). */
2803 lod_comp->llc_stripe_count = 0;
2805 lod_comp->llc_layout_gen = 0;
2806 lod_comp->llc_stripe = stripe;
2807 lod_comp->llc_ost_indices = ost_indices;
2808 lod_comp->llc_stripes_allocated = stripe_len;
2812 * lod_qos_parse_config() found supplied buf as a predefined
2813 * striping (not a hint), so it allocated all the object
2814 * now we need to create them
2816 for (i = 0; i < lod_comp->llc_stripe_count; i++) {
2817 struct dt_object *o;
2819 o = lod_comp->llc_stripe[i];
2822 rc = lod_sub_declare_create(env, o, attr, NULL,
2825 CERROR("can't declare create: %d\n", rc);
2830 * Clear LCME_FL_INIT for the component so that
2831 * lod_striping_create() can create the striping objects
2834 lod_comp_unset_init(lod_comp);
2840 OBD_FREE_PTR_ARRAY(stripe, stripe_len);
2842 OBD_FREE_PTR_ARRAY(ost_indices, stripe_len);
2847 int lod_prepare_create(const struct lu_env *env, struct lod_object *lo,
2848 struct lu_attr *attr, const struct lu_buf *buf,
2852 struct lod_device *d = lu2lod_dev(lod2lu_obj(lo)->lo_dev);
2860 /* no OST available */
2861 /* XXX: should we be waiting a bit to prevent failures during
2862 * cluster initialization? */
2863 if (!d->lod_ost_count)
2867 * by this time, the object's ldo_stripe_count and ldo_stripe_size
2868 * contain default value for striping: taken from the parent
2869 * or from filesystem defaults
2871 * in case the caller is passing lovea with new striping config,
2872 * we may need to parse lovea and apply new configuration
2874 rc = lod_qos_parse_config(env, lo, buf);
2878 if (attr->la_valid & LA_SIZE)
2879 size = attr->la_size;
2882 * prepare OST object creation for the component covering file's
2883 * size, the 1st component (including plain layout file) is always
2886 for (i = 0; i < lo->ldo_comp_cnt; i++) {
2887 struct lod_layout_component *lod_comp;
2888 struct lu_extent *extent;
2890 lod_comp = &lo->ldo_comp_entries[i];
2891 extent = &lod_comp->llc_extent;
2892 CDEBUG(D_OTHER, "comp[%d] %lld "DEXT"\n", i, size, PEXT(extent));
2893 if (!lo->ldo_is_composite || size >= extent->e_start) {
2894 rc = lod_qos_prep_create(env, lo, attr, th, i, 0);