Whamcloud - gitweb
LU-11146 lustre: fix setstripe for specific osts upon dir
[fs/lustre-release.git] / lustre / lod / lod_qos.c
index 4943bc2..09a0630 100644 (file)
@@ -23,7 +23,7 @@
  * Copyright  2009 Sun Microsystems, Inc. All rights reserved
  * Use is subject to license terms.
  *
- * Copyright (c) 2012, 2016, Intel Corporation.
+ * Copyright (c) 2012, 2017, Intel Corporation.
  */
 /*
  * This file is part of Lustre, http://www.lustre.org/
@@ -77,6 +77,7 @@ int qos_add_tgt(struct lod_device *lod, struct lod_tgt_desc *ost_desc)
        struct obd_export  *exp = ost_desc->ltd_exp;
        int                 rc = 0, found = 0;
        struct list_head   *list;
+       __u32 id = 0;
        ENTRY;
 
        down_write(&lod->lod_qos.lq_rw_sem);
@@ -91,6 +92,8 @@ int qos_add_tgt(struct lod_device *lod, struct lod_tgt_desc *ost_desc)
                        found++;
                        break;
                }
+               if (oss->lqo_id > id)
+                       id = oss->lqo_id;
        }
 
        if (!found) {
@@ -99,6 +102,8 @@ int qos_add_tgt(struct lod_device *lod, struct lod_tgt_desc *ost_desc)
                        GOTO(out, rc = -ENOMEM);
                memcpy(&oss->lqo_uuid, &exp->exp_connection->c_remote_uuid,
                       sizeof(oss->lqo_uuid));
+               ++id;
+               oss->lqo_id = id;
        } else {
                /* Assume we have to move this one */
                list_del(&oss->lqo_oss_list);
@@ -211,6 +216,10 @@ static int lod_statfs_and_check(const struct lu_env *env, struct lod_device *d,
        if (sfs->os_state & OS_STATE_READONLY)
                rc = -EROFS;
 
+       /* object precreation is skipped on the OST with max_create_count=0 */
+       if (sfs->os_state & OS_STATE_NOPRECREATE)
+               rc = -ENOBUFS;
+
        /* check whether device has changed state (active, inactive) */
        if (rc != 0 && ost->ltd_active) {
                /* turned inactive? */
@@ -259,24 +268,25 @@ static int lod_statfs_and_check(const struct lu_env *env, struct lod_device *d,
  * \param[in] env      execution environment for this thread
  * \param[in] lod      LOD device
  */
-static void lod_qos_statfs_update(const struct lu_env *env,
-                                 struct lod_device *lod)
+void lod_qos_statfs_update(const struct lu_env *env, struct lod_device *lod)
 {
        struct obd_device *obd = lod2obd(lod);
-       struct ost_pool   *osts = &(lod->lod_pool_info);
-       unsigned int       i;
-       int                idx;
-       __u64              max_age, avail;
+       struct ost_pool *osts = &(lod->lod_pool_info);
+       time64_t max_age;
+       unsigned int i;
+       u64 avail;
+       int idx;
        ENTRY;
 
-       max_age = cfs_time_shift_64(-2 * lod->lod_desc.ld_qos_maxage);
+       max_age = ktime_get_seconds() - 2 * lod->lod_desc.ld_qos_maxage;
 
-       if (cfs_time_beforeq_64(max_age, obd->obd_osfs_age))
+       if (obd->obd_osfs_age > max_age)
                /* statfs data are quite recent, don't need to refresh it */
                RETURN_EXIT;
 
        down_write(&lod->lod_qos.lq_rw_sem);
-       if (cfs_time_beforeq_64(max_age, obd->obd_osfs_age))
+
+       if (obd->obd_osfs_age > max_age)
                goto out;
 
        for (i = 0; i < osts->op_count; i++) {
@@ -289,7 +299,7 @@ static void lod_qos_statfs_update(const struct lu_env *env,
                        /* recalculate weigths */
                        lod->lod_qos.lq_dirty = 1;
        }
-       obd->obd_osfs_age = cfs_time_current_64();
+       obd->obd_osfs_age = ktime_get_seconds();
 
 out:
        up_write(&lod->lod_qos.lq_rw_sem);
@@ -319,7 +329,7 @@ static int lod_qos_calc_ppo(struct lod_device *lod)
        __u32               num_active;
        unsigned int        i;
        int                 rc, prio_wide;
-       time_t              now, age;
+       time64_t            now, age;
        ENTRY;
 
        if (!lod->lod_qos.lq_dirty)
@@ -343,9 +353,10 @@ static int lod_qos_calc_ppo(struct lod_device *lod)
 
        ba_min = (__u64)(-1);
        ba_max = 0;
-       now = cfs_time_current_sec();
+       now = ktime_get_real_seconds();
        /* Calculate OST penalty per object
-        * (lod ref taken in lod_qos_prep_create()) */
+        * (lod ref taken in lod_qos_prep_create())
+        */
        cfs_foreach_bit(lod->lod_ost_bitmap, i) {
                LASSERT(OST_TGT(lod,i));
                temp = TGT_BAVAIL(i);
@@ -481,7 +492,7 @@ static int lod_qos_used(struct lod_device *lod, struct ost_pool *osts,
        oss->lqo_penalty >>= 1;
 
        /* mark the OSS and OST as recently used */
-       ost->ltd_qos.ltq_used = oss->lqo_used = cfs_time_current_sec();
+       ost->ltd_qos.ltq_used = oss->lqo_used = ktime_get_real_seconds();
 
        /* Set max penalties for this OST and OSS */
        ost->ltd_qos.ltq_penalty +=
@@ -819,61 +830,122 @@ static int lod_qos_is_ost_used(const struct lu_env *env, int ost, __u32 stripes)
        return 0;
 }
 
+static inline bool
+lod_obj_is_ost_use_skip_cb(const struct lu_env *env, struct lod_object *lo,
+                          int comp_idx, struct lod_obj_stripe_cb_data *data)
+{
+       struct lod_layout_component *entries = lo->ldo_comp_entries;
+
+       return !entries[comp_idx].llc_ost_indices; /* no OST list: skip */
+}
+
+static inline int
+lod_obj_is_ost_use_cb(const struct lu_env *env, struct lod_object *lo,
+                     int comp_idx, struct lod_obj_stripe_cb_data *data)
+{
+       struct lod_layout_component *entry = &lo->ldo_comp_entries[comp_idx];
+       int idx;
+
+       for (idx = 0; idx < entry->llc_stripe_count; idx++) {
+               if (entry->llc_ost_indices[idx] != data->locd_ost_index)
+                       continue;
+               /* hit: leave -1 in locd_ost_index as the "found" marker */
+               data->locd_ost_index = -1;
+               return -EEXIST;
+       }
+       return 0;
+}
+
 /**
  * Check is OST used in a composite layout
  *
- * \param[in] inuse    all inuse ost indexs
+ * \param[in] env,lo   execution environment and lod object to search
  * \param[in] ost      OST target index to check
  *
- * \retval 0           not used
- * \retval 1           used
+ * \retval false       not used
+ * \retval true        used
  */
-static inline int lod_comp_is_ost_used(struct ost_pool *inuse, int ost)
+static inline bool lod_comp_is_ost_used(const struct lu_env *env,
+                                      struct lod_object *lo, int ost)
 {
-       __u32 j;
-       LASSERT(inuse != NULL);
+       struct lod_obj_stripe_cb_data data = { { 0 } };
 
-       if (inuse->op_size == 0)
-               return 0;
+       data.locd_ost_index = ost;
+       data.locd_comp_skip_cb = lod_obj_is_ost_use_skip_cb;
+       data.locd_comp_cb = lod_obj_is_ost_use_cb;
 
-       LASSERT(inuse->op_count * sizeof(inuse->op_array[0]) <= inuse->op_size);
-       for (j = 0; j < inuse->op_count; j++) {
-               if (inuse->op_array[j] == ost)
-                       return 1;
-       }
-       return 0;
+       (void)lod_obj_for_each_stripe(env, lo, NULL, &data);
+
+       return data.locd_ost_index == -1; /* set by the callback on a hit */
 }
 
-/**
- * Mark the given target as used for a composite layout
- *
- * \param[in] inuse    inuse ost index array
- * \param[in] idx      index in the array
- */
-static inline void lod_comp_ost_in_use(struct ost_pool *inuse, int ost)
+static inline void lod_avoid_update(struct lod_object *lo,
+                                   struct lod_avoid_guide *lag)
+{
+       /* lag_ost_avail is only decremented for objects that
+        * lod_is_flr() accepts; all others leave the guide untouched
+        */
+       if (lod_is_flr(lo))
+               lag->lag_ost_avail--;
+}
+
+/*
+ * Return true when OST \a index should be skipped: it is not set in
+ * lod_ost_bitmap, or both its OSS and the OST are recorded in \a lag.
+ */
+static inline bool lod_should_avoid_ost(struct lod_object *lo,
+                                       struct lod_avoid_guide *lag,
+                                       __u32 index)
 {
-       LASSERT(inuse != NULL);
-       if (inuse->op_size && !lod_comp_is_ost_used(inuse, ost)) {
-               LASSERT(inuse->op_count * sizeof(inuse->op_array[0]) <
-                       inuse->op_size);
-               inuse->op_array[inuse->op_count] = ost;
-               inuse->op_count++;
+       struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
+       struct lod_qos_oss *lqo;
+       bool used = false;
+       int i;
+
+       if (!cfs_bitmap_check(lod->lod_ost_bitmap, index))
+               return true;
+
+       /* we've tried our best: all available OSTs have already been
+        * used in overlapped components in the other mirror
+        */
+       if (lag->lag_ost_avail == 0)
+               return false;
+
+       /* look the target up only now that its bitmap bit is known set */
+       lqo = OST_TGT(lod, index)->ltd_qos.ltq_oss;
+       for (i = 0; i < lag->lag_oaa_count; i++) {
+               if (lag->lag_oss_avoid_array[i] == lqo->lqo_id) {
+                       used = true;
+                       break;
+               }
        }
+       /* an OSS that has not been used yet is always acceptable */
+       if (!used)
+               return false;
+
+       /* if the OSS has been used, check whether the OST has been used */
+       if (!cfs_bitmap_check(lag->lag_ost_avoid_bitmap, index))
+               used = false;
+       else
+               QOS_DEBUG("OST%d: has been used in overlapped component "
+                         "in other mirror\n", index);
+       return used;
 }
 
 static int lod_check_and_reserve_ost(const struct lu_env *env,
-                                    struct lod_device *m,
+                                    struct lod_object *lo,
                                     struct obd_statfs *sfs, __u32 ost_idx,
                                     __u32 speed, __u32 *s_idx,
                                     struct dt_object **stripe,
-                                    struct thandle *th,
-                                    struct ost_pool *inuse)
+                                    __u32 *ost_indices,
+                                    struct thandle *th)
 {
+       struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
+       struct lod_avoid_guide *lag = &lod_env_info(env)->lti_avoid;
        struct dt_object   *o;
        __u32 stripe_idx = *s_idx;
        int rc;
 
-       rc = lod_statfs_and_check(env, m, ost_idx, sfs);
+       rc = lod_statfs_and_check(env, lod, ost_idx, sfs);
        if (rc) {
                /* this OSP doesn't feel well */
                goto out_return;
@@ -900,18 +972,27 @@ static int lod_check_and_reserve_ost(const struct lu_env *env,
         * try not allocate on OST which has been used by other
         * component
         */
-       if (speed == 0 && lod_comp_is_ost_used(inuse, ost_idx)) {
+       if (speed == 0 && lod_comp_is_ost_used(env, lo, ost_idx)) {
                QOS_DEBUG("#%d: used by other component\n", ost_idx);
                goto out_return;
        }
 
+       /**
+        * try not to allocate OSTs used by conflicting components of other
+        * mirrors during the first two passes (speed 0 and 1).
+        */
+       if (speed < 2 && lod_should_avoid_ost(lo, lag, ost_idx)) {
+               QOS_DEBUG("#%d: used by overlapped component of other mirror\n",
+                         ost_idx);
+               goto out_return;
+       }
        /*
         * do not put >1 objects on a single OST
         */
        if (lod_qos_is_ost_used(env, ost_idx, stripe_idx))
                goto out_return;
 
-       o = lod_qos_declare_object_on(env, m, ost_idx, th);
+       o = lod_qos_declare_object_on(env, lod, ost_idx, th);
        if (IS_ERR(o)) {
                CDEBUG(D_OTHER, "can't declare new object on #%u: %d\n",
                       ost_idx, (int) PTR_ERR(o));
@@ -922,9 +1003,10 @@ static int lod_check_and_reserve_ost(const struct lu_env *env,
        /*
         * We've successfully declared (reserved) an object
         */
+       lod_avoid_update(lo, lag);
        lod_qos_ost_in_use(env, stripe_idx, ost_idx);
-       lod_comp_ost_in_use(inuse, ost_idx);
        stripe[stripe_idx] = o;
+       ost_indices[stripe_idx] = ost_idx;
        OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_LOV_CREATE_RACE, 2);
        stripe_idx++;
        *s_idx = stripe_idx;
@@ -947,22 +1029,21 @@ out_return:
  * time we give priority to targets which already have objects precreated.
  * Full OSTs are skipped (see lod_qos_dev_is_full() for the details).
  *
- * \param[in] env      execution environment for this thread
- * \param[in] lo       LOD object
- * \param[out] stripe  striping created
- * \param[in] flags    allocation flags (0 or LOV_USES_DEFAULT_STRIPE)
- * \param[in] th       transaction handle
- * \param[in] comp_idx index of ldo_comp_entries
- * \param[in|out] inuse        array of inuse ost index
+ * \param[in] env              execution environment for this thread
+ * \param[in] lo               LOD object
+ * \param[out] stripe          striping created
+ * \param[out] ost_indices     ost indices of striping created
+ * \param[in] flags            allocation flags (0 or LOV_USES_DEFAULT_STRIPE)
+ * \param[in] th               transaction handle
+ * \param[in] comp_idx         index of ldo_comp_entries
  *
  * \retval 0           on success
  * \retval -ENOSPC     if not enough OSTs are found
  * \retval negative    negated errno for other failures
  */
 static int lod_alloc_rr(const struct lu_env *env, struct lod_object *lo,
-                       struct dt_object **stripe, int flags,
-                       struct thandle *th, int comp_idx,
-                       struct ost_pool *inuse)
+                       struct dt_object **stripe, __u32 *ost_indices,
+                       int flags, struct thandle *th, int comp_idx)
 {
        struct lod_layout_component *lod_comp;
        struct lod_device *m = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
@@ -1048,8 +1129,9 @@ repeat_find:
                        continue;
 
                spin_unlock(&lqr->lqr_alloc);
-               rc = lod_check_and_reserve_ost(env, m, sfs, ost_idx, speed,
-                                              &stripe_idx, stripe, th, inuse);
+               rc = lod_check_and_reserve_ost(env, lo, sfs, ost_idx, speed,
+                                              &stripe_idx, stripe, ost_indices,
+                                              th);
                spin_lock(&lqr->lqr_alloc);
 
                if (rc != 0 && OST_TGT(m, ost_idx)->ltd_connecting)
@@ -1102,12 +1184,12 @@ out:
  * structures are protected, but no concurrent allocation is allowed on the
  * same objects.
  *
- * \param[in] env      execution environment for this thread
- * \param[in] lo       LOD object
- * \param[out] stripe  striping created
- * \param[in] th       transaction handle
- * \param[in] comp_idx index of ldo_comp_entries
- * \param[in|out] inuse        array of inuse ost index
+ * \param[in] env              execution environment for this thread
+ * \param[in] lo               LOD object
+ * \param[out] stripe          striping created
+ * \param[out] ost_indices     ost indices of striping created
+ * \param[in] th               transaction handle
+ * \param[in] comp_idx         index of ldo_comp_entries
  *
  * \retval 0           on success
  * \retval -ENODEV     OST index does not exist on file system
@@ -1115,8 +1197,8 @@ out:
  * \retval negative    negated errno on error
  */
 static int lod_alloc_ost_list(const struct lu_env *env, struct lod_object *lo,
-                             struct dt_object **stripe, struct thandle *th,
-                             int comp_idx, struct ost_pool *inuse)
+                             struct dt_object **stripe, __u32 *ost_indices,
+                             struct thandle *th, int comp_idx)
 {
        struct lod_layout_component *lod_comp;
        struct lod_device       *m = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
@@ -1137,6 +1219,10 @@ static int lod_alloc_ost_list(const struct lu_env *env, struct lod_object *lo,
        if (rc < 0)
                RETURN(rc);
 
+       if (lod_comp->llc_stripe_offset == LOV_OFFSET_DEFAULT)
+               lod_comp->llc_stripe_offset =
+                               lod_comp->llc_ostlist.op_array[0];
+
        for (i = 0; i < lod_comp->llc_stripe_count; i++) {
                if (lod_comp->llc_ostlist.op_array[i] ==
                    lod_comp->llc_stripe_offset) {
@@ -1185,8 +1271,8 @@ static int lod_alloc_ost_list(const struct lu_env *env, struct lod_object *lo,
                 * We've successfully declared (reserved) an object
                 */
                lod_qos_ost_in_use(env, stripe_count, ost_idx);
-               lod_comp_ost_in_use(inuse, ost_idx);
                stripe[stripe_count] = o;
+               ost_indices[stripe_count] = ost_idx;
                stripe_count++;
        }
 
@@ -1205,13 +1291,13 @@ static int lod_alloc_ost_list(const struct lu_env *env, struct lod_object *lo,
  * release the stripes allocated. All the internal structures are protected,
  * but no concurrent allocation is allowed on the same objects.
  *
- * \param[in] env      execution environment for this thread
- * \param[in] lo       LOD object
- * \param[out] stripe  striping created
- * \param[in] flags    not used
- * \param[in] th       transaction handle
- * \param[in] comp_idx index of ldo_comp_entries
- * \param[in|out]inuse array of inuse ost index
+ * \param[in] env              execution environment for this thread
+ * \param[in] lo               LOD object
+ * \param[out] stripe          striping created
+ * \param[out] ost_indices     ost indices of striping created
+ * \param[in] flags            not used
+ * \param[in] th               transaction handle
+ * \param[in] comp_idx         index of ldo_comp_entries
  *
  * \retval 0           on success
  * \retval -ENOSPC     if no OST objects are available at all
@@ -1220,9 +1306,8 @@ static int lod_alloc_ost_list(const struct lu_env *env, struct lod_object *lo,
  * \retval negative    errno on failure
  */
 static int lod_alloc_specific(const struct lu_env *env, struct lod_object *lo,
-                             struct dt_object **stripe, int flags,
-                             struct thandle *th, int comp_idx,
-                             struct ost_pool *inuse)
+                             struct dt_object **stripe, __u32 *ost_indices,
+                             int flags, struct thandle *th, int comp_idx)
 {
        struct lod_layout_component *lod_comp;
        struct lod_device *m = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
@@ -1293,7 +1378,7 @@ repeat_find:
                 * try not allocate on the OST used by other component
                 */
                if (speed == 0 && i != 0 &&
-                   lod_comp_is_ost_used(inuse, ost_idx))
+                   lod_comp_is_ost_used(env, lo, ost_idx))
                        continue;
 
                /* Drop slow OSCs if we can, but not for requested start idx.
@@ -1309,9 +1394,9 @@ repeat_find:
                }
 
                /*
-                * We expect number of precreated objects in f_ffree at
-                * the first iteration, skip OSPs with no objects ready
-                * don't apply this logic to OST specified with stripe_offset
+                * We expect number of precreated objects at the first
+                * iteration.  Skip OSPs with no objects ready.  Don't apply
+                * this logic to OST specified with stripe_offset.
                 */
                if (i != 0 && sfs->os_fprecreated == 0 && speed == 0)
                        continue;
@@ -1327,8 +1412,8 @@ repeat_find:
                 * We've successfully declared (reserved) an object
                 */
                lod_qos_ost_in_use(env, stripe_num, ost_idx);
-               lod_comp_ost_in_use(inuse, ost_idx);
                stripe[stripe_num] = o;
+               ost_indices[stripe_num] = ost_idx;
                stripe_num++;
 
                /* We have enough stripes */
@@ -1411,13 +1496,13 @@ static inline int lod_qos_is_usable(struct lod_device *lod)
  * An OST with a higher weight is proportionately more likely to be selected
  * than one with a lower weight.
  *
- * \param[in] env      execution environment for this thread
- * \param[in] lo       LOD object
- * \param[out] stripe  striping created
- * \param[in] flags    0 or LOV_USES_DEFAULT_STRIPE
- * \param[in] th       transaction handle
- * \param[in] comp_idx index of ldo_comp_entries
- * \param[in|out]inuse array of inuse ost index
+ * \param[in] env              execution environment for this thread
+ * \param[in] lo               LOD object
+ * \param[out] stripe          striping created
+ * \param[out] ost_indices     ost indices of striping created
+ * \param[in] flags            0 or LOV_USES_DEFAULT_STRIPE
+ * \param[in] th               transaction handle
+ * \param[in] comp_idx         index of ldo_comp_entries
  *
  * \retval 0           on success
  * \retval -EAGAIN     not enough OSTs are found for specified stripe count
@@ -1425,20 +1510,20 @@ static inline int lod_qos_is_usable(struct lod_device *lod)
  * \retval negative    errno on failure
  */
 static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo,
-                        struct dt_object **stripe, int flags,
-                        struct thandle *th, int comp_idx,
-                        struct ost_pool *inuse)
+                        struct dt_object **stripe, __u32 *ost_indices,
+                        int flags, struct thandle *th, int comp_idx)
 {
        struct lod_layout_component *lod_comp;
        struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
        struct obd_statfs *sfs = &lod_env_info(env)->lti_osfs;
+       struct lod_avoid_guide *lag = &lod_env_info(env)->lti_avoid;
        struct lod_tgt_desc *ost;
        struct dt_object *o;
        __u64 total_weight = 0;
        struct pool_desc *pool = NULL;
        struct ost_pool *osts;
        unsigned int i;
-       __u32   nfound, good_osts, stripe_count, stripe_count_min;
+       __u32 nfound, good_osts, stripe_count, stripe_count_min;
        int rc = 0;
        ENTRY;
 
@@ -1559,7 +1644,7 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo,
                for (i = 0; i < osts->op_count; i++) {
                        __u32 idx = osts->op_array[i];
 
-                       if (!cfs_bitmap_check(lod->lod_ost_bitmap, idx))
+                       if (lod_should_avoid_ost(lo, lag, idx))
                                continue;
 
                        ost = OST_TGT(lod, idx);
@@ -1581,7 +1666,7 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo,
                         * do not put >1 objects on a single OST
                         */
                        if (lod_qos_is_ost_used(env, idx, nfound) ||
-                           lod_comp_is_ost_used(inuse, idx))
+                           lod_comp_is_ost_used(env, lo, idx))
                                continue;
 
                        o = lod_qos_declare_object_on(env, lod, idx, th);
@@ -1591,10 +1676,12 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo,
                                continue;
                        }
 
+                       lod_avoid_update(lo, lag);
                        lod_qos_ost_in_use(env, nfound, idx);
-                       lod_comp_ost_in_use(inuse, idx);
-                       stripe[nfound++] = o;
+                       stripe[nfound] = o;
+                       ost_indices[nfound] = idx;
                        lod_qos_used(lod, osts, idx, &total_weight);
+                       nfound++;
                        rc = 0;
                        break;
                }
@@ -1620,9 +1707,6 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo,
                        dt_object_put(env, stripe[i]);
                        stripe[i] = NULL;
                }
-               LASSERTF(nfound <= inuse->op_count,
-                        "nfound:%d, op_count:%u\n", nfound, inuse->op_count);
-               inuse->op_count -= nfound;
 
                /* makes sense to rebalance next time */
                lod->lod_qos.lq_dirty = 1;
@@ -1730,18 +1814,23 @@ int lod_use_defined_striping(const struct lu_env *env,
        int     rc = 0, i;
        ENTRY;
 
+       mutex_lock(&mo->ldo_layout_mutex);
+       lod_striping_free_nolock(env, mo);
+
        magic = le32_to_cpu(v1->lmm_magic) & ~LOV_MAGIC_DEFINED;
 
        if (magic != LOV_MAGIC_V1 && magic != LOV_MAGIC_V3 &&
            magic != LOV_MAGIC_COMP_V1)
-               RETURN(-EINVAL);
+               GOTO(unlock, rc = -EINVAL);
 
        if (magic == LOV_MAGIC_COMP_V1) {
                comp_v1 = buf->lb_buf;
                comp_cnt = le16_to_cpu(comp_v1->lcm_entry_count);
                if (comp_cnt == 0)
-                       RETURN(-EINVAL);
+                       GOTO(unlock, rc = -EINVAL);
                mirror_cnt = le16_to_cpu(comp_v1->lcm_mirror_count) + 1;
+               mo->ldo_flr_state = le16_to_cpu(comp_v1->lcm_flags) &
+                                       LCM_FL_FLR_MASK;
                mo->ldo_is_composite = 1;
        } else {
                mo->ldo_is_composite = 0;
@@ -1752,7 +1841,7 @@ int lod_use_defined_striping(const struct lu_env *env,
 
        rc = lod_alloc_comp_entries(mo, mirror_cnt, comp_cnt);
        if (rc)
-               RETURN(rc);
+               GOTO(unlock, rc);
 
        for (i = 0; i < comp_cnt; i++) {
                struct lu_extent *ext;
@@ -1812,11 +1901,12 @@ int lod_use_defined_striping(const struct lu_env *env,
        }
 
        rc = lod_fill_mirrors(mo);
-       if (rc)
-               GOTO(out, rc);
+       GOTO(out, rc);
 out:
        if (rc)
-               lod_object_free_striping(env, mo);
+               lod_striping_free_nolock(env, mo);
+unlock:
+       mutex_unlock(&mo->ldo_layout_mutex);
 
        RETURN(rc);
 }
@@ -1847,6 +1937,7 @@ int lod_qos_parse_config(const struct lu_env *env, struct lod_object *lo,
        struct lov_user_md_v1   *v1 = NULL;
        struct lov_user_md_v3   *v3 = NULL;
        struct lov_comp_md_v1   *comp_v1 = NULL;
+       char    def_pool[LOV_MAXPOOLNAME + 1];
        __u32   magic;
        __u16   comp_cnt;
        __u16   mirror_cnt;
@@ -1856,12 +1947,18 @@ int lod_qos_parse_config(const struct lu_env *env, struct lod_object *lo,
        if (buf == NULL || buf->lb_buf == NULL || buf->lb_len == 0)
                RETURN(0);
 
+       memset(def_pool, 0, sizeof(def_pool));
+       if (lo->ldo_comp_entries != NULL)
+               lod_layout_get_pool(lo->ldo_comp_entries, lo->ldo_comp_cnt,
+                                   def_pool, sizeof(def_pool));
+
+       /* free default striping info */
+       lod_free_comp_entries(lo);
+
        rc = lod_verify_striping(d, lo, buf, false);
        if (rc)
                RETURN(-EINVAL);
 
-       lod_free_comp_entries(lo);
-
        v3 = buf->lb_buf;
        v1 = buf->lb_buf;
        comp_v1 = buf->lb_buf;
@@ -1913,6 +2010,8 @@ int lod_qos_parse_config(const struct lu_env *env, struct lod_object *lo,
                if (comp_cnt == 0)
                        RETURN(-EINVAL);
                mirror_cnt =  comp_v1->lcm_mirror_count + 1;
+               if (mirror_cnt > 1)
+                       lo->ldo_flr_state = LCM_FL_RDONLY;
                lo->ldo_is_composite = 1;
        } else {
                comp_cnt = 1;
@@ -1924,6 +2023,8 @@ int lod_qos_parse_config(const struct lu_env *env, struct lod_object *lo,
        if (rc)
                RETURN(rc);
 
+       LASSERT(lo->ldo_comp_entries);
+
        for (i = 0; i < comp_cnt; i++) {
                struct pool_desc        *pool;
                struct lu_extent        *ext;
@@ -1936,38 +2037,28 @@ int lod_qos_parse_config(const struct lu_env *env, struct lod_object *lo,
                                        comp_v1->lcm_entries[i].lcme_offset);
                        ext = &comp_v1->lcm_entries[i].lcme_extent;
                        lod_comp->llc_extent = *ext;
+                       lod_comp->llc_flags =
+                               comp_v1->lcm_entries[i].lcme_flags &
+                                       LCME_USER_FLAGS;
                }
 
                pool_name = NULL;
                if (v1->lmm_magic == LOV_USER_MAGIC_V3 ||
                    v1->lmm_magic == LOV_USER_MAGIC_SPECIFIC) {
-                       int j;
-
                        v3 = (struct lov_user_md_v3 *)v1;
                        if (v3->lmm_pool_name[0] != '\0')
                                pool_name = v3->lmm_pool_name;
 
                        if (v3->lmm_magic == LOV_USER_MAGIC_SPECIFIC) {
-                               if (v3->lmm_stripe_offset == LOV_OFFSET_DEFAULT)
-                                       v3->lmm_stripe_offset =
-                                               v3->lmm_objects[0].l_ost_idx;
-
-                               /* copy ost list from lmm */
-                               lod_comp->llc_ostlist.op_count =
-                                       v3->lmm_stripe_count;
-                               lod_comp->llc_ostlist.op_size =
-                                       v3->lmm_stripe_count * sizeof(__u32);
-                               OBD_ALLOC(lod_comp->llc_ostlist.op_array,
-                                         lod_comp->llc_ostlist.op_size);
-                               if (!lod_comp->llc_ostlist.op_array)
-                                       GOTO(free_comp, rc = -ENOMEM);
-
-                               for (j = 0; j < v3->lmm_stripe_count; j++)
-                                       lod_comp->llc_ostlist.op_array[j] =
-                                               v3->lmm_objects[j].l_ost_idx;
+                               rc = lod_comp_copy_ost_lists(lod_comp, v3);
+                               if (rc)
+                                       GOTO(free_comp, rc);
                        }
                }
 
+               if (pool_name == NULL && def_pool[0] != '\0')
+                       pool_name = def_pool;
+
                if (v1->lmm_pattern == 0)
                        v1->lmm_pattern = LOV_PATTERN_RAID0;
                if (lov_pattern(v1->lmm_pattern) != LOV_PATTERN_RAID0 &&
@@ -2029,6 +2120,138 @@ free_comp:
 }
 
 /**
+ * Reset/resize env's OST avoid bitmap and OSS avoid array; 0 or -ENOMEM
+ */
+int lod_prepare_avoidance(const struct lu_env *env, struct lod_object *lo)
+{
+       struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
+       struct lod_tgt_descs *ltds = &lod->lod_ost_descs;
+       struct lod_avoid_guide *lag = &lod_env_info(env)->lti_avoid;
+       struct cfs_bitmap *bitmap = NULL;
+       __u32 *new_oss = NULL;
+
+       lag->lag_ost_avail = ltds->ltd_tgtnr; /* none avoided yet */
+
+       /* empty the OSS avoid array; drop it if too small for ltd_tgtnr */
+       lag->lag_oaa_count = 0;
+       if (lag->lag_oss_avoid_array && lag->lag_oaa_size < ltds->ltd_tgtnr) {
+               OBD_FREE(lag->lag_oss_avoid_array,
+                        sizeof(__u32) * lag->lag_oaa_size);
+               lag->lag_oss_avoid_array = NULL;
+               lag->lag_oaa_size = 0;
+       }
+
+       /* clear the OST avoid bitmap if it is big enough, else free it */
+       if (lag->lag_ost_avoid_bitmap) {
+               if (ltds->ltd_tgtnr <= lag->lag_ost_avoid_bitmap->size) {
+                       CFS_RESET_BITMAP(lag->lag_ost_avoid_bitmap);
+               } else {
+                       CFS_FREE_BITMAP(lag->lag_ost_avoid_bitmap);
+                       lag->lag_ost_avoid_bitmap = NULL;
+               }
+       }
+
+       if (!lag->lag_ost_avoid_bitmap) {
+               bitmap = CFS_ALLOCATE_BITMAP(ltds->ltd_tgtnr);
+               if (!bitmap)
+                       return -ENOMEM;
+       }
+
+       if (!lag->lag_oss_avoid_array) {
+               /**
+                * An OSS usually serves several OSTs, but the exact OSS
+                * count is not known here.  Sizing the array by the OST
+                * count (ltd_tgtnr) is the safe choice for storing the
+                * OSS ids.
+                */
+               OBD_ALLOC(new_oss, sizeof(*new_oss) * ltds->ltd_tgtnr);
+               if (!new_oss) {
+                       CFS_FREE_BITMAP(bitmap); /* NULL if old bitmap kept */
+                       return -ENOMEM;
+               }
+       }
+       /* no failure possible past here: install what was allocated */
+       if (new_oss) {
+               lag->lag_oss_avoid_array = new_oss;
+               lag->lag_oaa_size = ltds->ltd_tgtnr;
+       }
+       if (bitmap)
+               lag->lag_ost_avoid_bitmap = bitmap;
+
+       return 0;
+}
+
+/**
+ * Mark in \a lag the OSTs (bitmap) and OSSs (id array) already used by
+ * instantiated components of other mirrors that overlap entry \a comp_idx.
+ */
+void lod_collect_avoidance(struct lod_object *lo, struct lod_avoid_guide *lag,
+                          int comp_idx)
+{
+       struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
+       struct lod_layout_component *lod_comp = &lo->ldo_comp_entries[comp_idx];
+       struct cfs_bitmap *bitmap = lag->lag_ost_avoid_bitmap;
+       int i, j;
+
+       /* iterate mirrors */
+       for (i = 0; i < lo->ldo_mirror_count; i++) {
+               struct lod_layout_component *comp;
+
+               /**
+                * skip the mirror that holds component[comp_idx]: only
+                * conflicting components of the other mirrors are collected,
+                * so that a read can fall back to a mirror placed on
+                * different OSTs when the OSTs of one mirror's component
+                * are not available.
+                */
+               comp = &lo->ldo_comp_entries[lo->ldo_mirrors[i].lme_start];
+               if (comp->llc_id == LCME_ID_INVAL ||
+                   mirror_id_of(comp->llc_id) ==
+                                               mirror_id_of(lod_comp->llc_id))
+                       continue;
+
+               /* iterate components of a mirror */
+               lod_foreach_mirror_comp(comp, lo, i) {
+                       /* skip non-overlapped or un-instantiated components */
+                       if (!lu_extent_is_overlapped(&comp->llc_extent,
+                                                    &lod_comp->llc_extent) ||
+                           !lod_comp_inited(comp) || !comp->llc_stripe)
+                               continue;
+
+                       /**
+                        * record each stripe's OST index and its OSS id;
+                        * NOTE(review): OST_TGT() result is used unchecked
+                        */
+                       for (j = 0; j < comp->llc_stripe_count; j++) {
+                               struct lod_tgt_desc *ost;
+                               struct lod_qos_oss *lqo;
+                               int k;
+
+                               ost = OST_TGT(lod, comp->llc_ost_indices[j]);
+                               lqo = ost->ltd_qos.ltq_oss;
+
+                               if (cfs_bitmap_check(bitmap, ost->ltd_index))
+                                       continue; /* OST already counted */
+
+                               cfs_bitmap_set(bitmap, ost->ltd_index);
+                               lag->lag_ost_avail--;
+                               /* append the OSS id unless already listed */
+                               for (k = 0; k < lag->lag_oaa_count; k++) {
+                                       if (lag->lag_oss_avoid_array[k] ==
+                                           lqo->lqo_id)
+                                               break;
+                               }
+                               if (k == lag->lag_oaa_count) {
+                                       lag->lag_oss_avoid_array[k] =
+                                                               lqo->lqo_id;
+                                       lag->lag_oaa_count++;
+                               }
+                       }
+               }
+       }
+}
+
+/**
  * Create a striping for an object.
  *
  * The function creates a new striping for the object. The function tries QoS
@@ -2042,21 +2265,22 @@ free_comp:
  * \param[in] attr     attributes OST objects will be declared with
  * \param[in] th       transaction handle
  * \param[in] comp_idx index of ldo_comp_entries
- * \param[in|out] inuse        array of inuse ost index
  *
  * \retval 0           on success
  * \retval negative    negated errno on error
  */
 int lod_qos_prep_create(const struct lu_env *env, struct lod_object *lo,
                        struct lu_attr *attr, struct thandle *th,
-                       int comp_idx, struct ost_pool *inuse)
+                       int comp_idx)
 {
        struct lod_layout_component *lod_comp;
        struct lod_device      *d = lu2lod_dev(lod2lu_obj(lo)->lo_dev);
-       struct dt_object      **stripe;
        int                     stripe_len;
        int                     flag = LOV_USES_ASSIGNED_STRIPE;
        int                     i, rc = 0;
+       struct lod_avoid_guide *lag = &lod_env_info(env)->lti_avoid;
+       struct dt_object **stripe = NULL;
+       __u32 *ost_indices = NULL;
        ENTRY;
 
        LASSERT(lo);
@@ -2089,6 +2313,9 @@ int lod_qos_prep_create(const struct lu_env *env, struct lod_object *lo,
                OBD_ALLOC(stripe, sizeof(stripe[0]) * stripe_len);
                if (stripe == NULL)
                        GOTO(out, rc = -ENOMEM);
+               OBD_ALLOC(ost_indices, sizeof(*ost_indices) * stripe_len);
+               if (!ost_indices)
+                       GOTO(out, rc = -ENOMEM);
 
                lod_getref(&d->lod_ost_descs);
                /* XXX: support for non-0 files w/o objects */
@@ -2096,29 +2323,38 @@ int lod_qos_prep_create(const struct lu_env *env, struct lod_object *lo,
                                d->lod_desc.ld_tgt_count, stripe_len);
 
                if (lod_comp->llc_ostlist.op_array) {
-                       rc = lod_alloc_ost_list(env, lo, stripe, th, comp_idx,
-                                               inuse);
+                       rc = lod_alloc_ost_list(env, lo, stripe, ost_indices,
+                                               th, comp_idx);
                } else if (lod_comp->llc_stripe_offset == LOV_OFFSET_DEFAULT) {
-                       rc = lod_alloc_qos(env, lo, stripe, flag, th,
-                                          comp_idx, inuse);
+                       /**
+                        * collect OSTs and OSSs used in other mirrors whose
+                        * components cross the ldo_comp_entries[comp_idx]
+                        */
+                       rc = lod_prepare_avoidance(env, lo);
+                       if (rc)
+                               GOTO(put_ldts, rc);
+
+                       lod_collect_avoidance(lo, lag, comp_idx);
+
+                       rc = lod_alloc_qos(env, lo, stripe, ost_indices, flag,
+                                          th, comp_idx);
                        if (rc == -EAGAIN)
-                               rc = lod_alloc_rr(env, lo, stripe, flag, th,
-                                                 comp_idx, inuse);
+                               rc = lod_alloc_rr(env, lo, stripe, ost_indices,
+                                                 flag, th, comp_idx);
                } else {
-                       rc = lod_alloc_specific(env, lo, stripe, flag, th,
-                                               comp_idx, inuse);
+                       rc = lod_alloc_specific(env, lo, stripe, ost_indices,
+                                               flag, th, comp_idx);
                }
+put_ldts:
                lod_putref(d, &d->lod_ost_descs);
-
                if (rc < 0) {
                        for (i = 0; i < stripe_len; i++)
                                if (stripe[i] != NULL)
                                        dt_object_put(env, stripe[i]);
-
-                       OBD_FREE(stripe, sizeof(stripe[0]) * stripe_len);
                        lod_comp->llc_stripe_count = 0;
                } else {
                        lod_comp->llc_stripe = stripe;
+                       lod_comp->llc_ost_indices = ost_indices;
                        lod_comp->llc_stripes_allocated = stripe_len;
                }
        } else {
@@ -2149,88 +2385,14 @@ int lod_qos_prep_create(const struct lu_env *env, struct lod_object *lo,
        }
 
 out:
-       RETURN(rc);
-}
-
-int lod_obj_stripe_set_inuse_cb(const struct lu_env *env,
-                               struct lod_object *lo,
-                               struct dt_object *dt, struct thandle *th,
-                               int stripe_idx,
-                               struct lod_obj_stripe_cb_data *data)
-{
-       struct lod_thread_info  *info = lod_env_info(env);
-       struct lod_device       *d = lu2lod_dev(lod2lu_obj(lo)->lo_dev);
-       struct lu_fid   *fid = &info->lti_fid;
-       __u32   index;
-       int     rc, type = LU_SEQ_RANGE_OST;
-
-       *fid = *lu_object_fid(&dt->do_lu);
-       rc = lod_fld_lookup(env, d, fid, &index, &type);
        if (rc < 0) {
-               CERROR("%s: fail to locate "DFID": rc = %d\n",
-                      lod2obd(d)->obd_name, PFID(fid), rc);
-               return rc;
+               if (stripe)
+                       OBD_FREE(stripe, sizeof(stripe[0]) * stripe_len);
+               if (ost_indices)
+                       OBD_FREE(ost_indices,
+                                sizeof(*ost_indices) * stripe_len);
        }
-       lod_comp_ost_in_use(data->locd_inuse, index);
-       return 0;
-}
-
-/**
- * Resize per-thread ost list to hold OST target index list already used.
- *
- * \param[in,out] inuse                structure contains ost list array
- * \param[in] cnt              total stripe count of all components
- * \param[in] max              array's max size if @max > 0
- *
- * \retval 0           on success
- * \retval -ENOMEM     reallocation failed
- */
-static int lod_inuse_resize(struct ost_pool *inuse, __u16 cnt, __u16 max)
-{
-       __u32 *array;
-       __u32 new = cnt * sizeof(inuse->op_array[0]);
-
-       inuse->op_count = 0;
-
-       if (new <= inuse->op_size)
-               return 0;
-
-       if (max)
-               new = min_t(__u32, new, max);
-
-       OBD_ALLOC(array, new);
-       if (!array)
-               return -ENOMEM;
-
-       if (inuse->op_array)
-               OBD_FREE(inuse->op_array, inuse->op_size);
-
-       inuse->op_array = array;
-       inuse->op_size = new;
-
-       return 0;
-}
-
-int lod_prepare_inuse(const struct lu_env *env, struct lod_object *lo)
-{
-       struct lod_thread_info *info = lod_env_info(env);
-       struct lod_device *d = lu2lod_dev(lod2lu_obj(lo)->lo_dev);
-       struct ost_pool *inuse = &info->lti_inuse_osts;
-       struct lod_obj_stripe_cb_data data;
-       __u32 stripe_count = 0;
-       int i;
-       int rc;
-
-       for (i = 0; i < lo->ldo_comp_cnt; i++)
-               stripe_count += lod_comp_entry_stripe_count(lo,
-                                       &lo->ldo_comp_entries[i], false);
-       rc = lod_inuse_resize(inuse, stripe_count, d->lod_osd_max_easize);
-       if (rc)
-               return rc;
-
-       data.locd_inuse = inuse;
-       return lod_obj_for_each_stripe(env, lo, NULL,
-                                      lod_obj_stripe_set_inuse_cb, &data);
+       RETURN(rc);
 }
 
 int lod_prepare_create(const struct lu_env *env, struct lod_object *lo,
@@ -2238,10 +2400,7 @@ int lod_prepare_create(const struct lu_env *env, struct lod_object *lo,
                       struct thandle *th)
 
 {
-       struct lod_thread_info *info = lod_env_info(env);
        struct lod_device *d = lu2lod_dev(lod2lu_obj(lo)->lo_dev);
-       struct ost_pool inuse_osts = { 0 };
-       struct ost_pool *inuse = &inuse_osts;
        uint64_t size = 0;
        int i;
        int rc;
@@ -2270,14 +2429,6 @@ int lod_prepare_create(const struct lu_env *env, struct lod_object *lo,
        if (attr->la_valid & LA_SIZE)
                size = attr->la_size;
 
-       /* only prepare inuse if multiple components to be created */
-       if (size && lo->ldo_is_composite) {
-               rc = lod_prepare_inuse(env, lo);
-               if (rc)
-                       RETURN(rc);
-               inuse = &info->lti_inuse_osts;
-       }
-
        /**
         * prepare OST object creation for the component covering file's
         * size, the 1st component (including plain layout file) is always
@@ -2292,7 +2443,7 @@ int lod_prepare_create(const struct lu_env *env, struct lod_object *lo,
                CDEBUG(D_QOS, "%lld [%lld, %lld)\n",
                       size, extent->e_start, extent->e_end);
                if (!lo->ldo_is_composite || size >= extent->e_start) {
-                       rc = lod_qos_prep_create(env, lo, attr, th, i, inuse);
+                       rc = lod_qos_prep_create(env, lo, attr, th, i);
                        if (rc)
                                break;
                }