Whamcloud - gitweb
LU-9007 lod: improve obj alloc for FLR file 04/32404/10
author Bobi Jam <bobijam@whamcloud.com>
Mon, 14 May 2018 11:10:24 +0000 (19:10 +0800)
committer Oleg Drokin <green@whamcloud.com>
Tue, 24 Jul 2018 16:01:46 +0000 (16:01 +0000)
* add lod_layout_component::llc_ost_indices to track the mapping
  from each stripe's dt_object to its OST index.
* add lod_thread_info::lti_avoid (struct lod_avoid_guide) to collect
  information about objects on other mirrors that overlap the target
  component.
* lod_should_avoid_ost() uses the avoid guidance information to avoid
  allocating objects on the same OST for different mirrors.

Change-Id: Ib7e155e4b02c2e25d3955aa9a4acff7569ab7d8f
Signed-off-by: Bobi Jam <bobijam@whamcloud.com>
Reviewed-on: https://review.whamcloud.com/32404
Reviewed-by: Jinshan Xiong <jinshan.xiong@gmail.com>
Reviewed-by: Patrick Farrell <paf@cray.com>
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/lod/lod_dev.c
lustre/lod/lod_internal.h
lustre/lod/lod_lov.c
lustre/lod/lod_object.c
lustre/lod/lod_qos.c
lustre/tests/sanity-flr.sh

index eb7192c..5a5eb10 100644 (file)
@@ -1727,6 +1727,15 @@ static struct lu_device *lod_device_alloc(const struct lu_env *env,
        return lu_dev;
 }
 
        return lu_dev;
 }
 
+static void lod_avoid_guide_fini(struct lod_avoid_guide *lag)
+{
+       if (lag->lag_oss_avoid_array)
+               OBD_FREE(lag->lag_oss_avoid_array,
+                        sizeof(__u32) * lag->lag_oaa_size);
+       if (lag->lag_ost_avoid_bitmap)
+               CFS_FREE_BITMAP(lag->lag_ost_avoid_bitmap);
+}
+
 /**
  * Implementation of lu_device_type_operations::ldto_device_fini() for LOD
  *
 /**
  * Implementation of lu_device_type_operations::ldto_device_fini() for LOD
  *
@@ -1874,6 +1883,8 @@ static void lod_key_fini(const struct lu_context *ctx,
                OBD_FREE(info->lti_comp_idx,
                         info->lti_comp_size * sizeof(__u32));
 
                OBD_FREE(info->lti_comp_idx,
                         info->lti_comp_size * sizeof(__u32));
 
+       lod_avoid_guide_fini(&info->lti_avoid);
+
        OBD_FREE_PTR(info);
 }
 
        OBD_FREE_PTR(info);
 }
 
index c2b9346..cb683f1 100644 (file)
@@ -102,6 +102,7 @@ struct lod_qos_oss {
                                                         every obj*/
        time64_t                 lqo_used;      /* last used time, seconds */
        __u32                    lqo_ost_count; /* number of osts on this oss */
                                                         every obj*/
        time64_t                 lqo_used;      /* last used time, seconds */
        __u32                    lqo_ost_count; /* number of osts on this oss */
+       __u32                    lqo_id;        /* unique oss id */
 };
 
 struct ltd_qos {
 };
 
 struct ltd_qos {
@@ -163,6 +164,19 @@ struct lod_tgt_descs {
        struct rw_semaphore     ltd_rw_sem;
 };
 
        struct rw_semaphore     ltd_rw_sem;
 };
 
+struct lod_avoid_guide {
+       /* ids of OSSs avoid guidance */
+       __u32                   *lag_oss_avoid_array;
+       /* number of filled array items */
+       unsigned int            lag_oaa_count;
+       /* number of allocated array items */
+       unsigned int            lag_oaa_size;
+       /* bitmap of OSTs avoid guidance */
+       struct cfs_bitmap       *lag_ost_avoid_bitmap;
+       /* how many OSTs are available for alloc */
+       __u32                   lag_ost_avail;
+};
+
 struct lod_device {
        struct dt_device      lod_dt_dev;
        struct obd_export    *lod_child_exp;
 struct lod_device {
        struct dt_device      lod_dt_dev;
        struct obd_export    *lod_child_exp;
@@ -244,6 +258,7 @@ struct lod_layout_component {
        /* ost list specified with LOV_USER_MAGIC_SPECIFIC lum */
        struct ost_pool           llc_ostlist;
        struct dt_object        **llc_stripe;
        /* ost list specified with LOV_USER_MAGIC_SPECIFIC lum */
        struct ost_pool           llc_ostlist;
        struct dt_object        **llc_stripe;
+       __u32                    *llc_ost_indices;
 };
 
 struct lod_default_striping {
 };
 
 struct lod_default_striping {
@@ -320,6 +335,19 @@ struct lod_object {
        struct dt_object                **ldo_stripe;
 };
 
        struct dt_object                **ldo_stripe;
 };
 
+#define lod_foreach_mirror_comp(comp, lo, mirror_idx)                      \
+for (comp = &lo->ldo_comp_entries[lo->ldo_mirrors[mirror_idx].lme_start];  \
+     comp <= &lo->ldo_comp_entries[lo->ldo_mirrors[mirror_idx].lme_end];   \
+     comp++)
+
+static inline bool lod_is_flr(const struct lod_object *lo)
+{
+       if (!lo->ldo_is_composite)
+               return false;
+
+       return (lo->ldo_flr_state & LCM_FL_FLR_MASK) != LCM_FL_NONE;
+}
+
 static inline int lod_set_pool(char **pool, const char *new_pool)
 {
        int len;
 static inline int lod_set_pool(char **pool, const char *new_pool)
 {
        int len;
@@ -413,6 +441,8 @@ struct lod_thread_info {
        size_t                          lti_comp_size;
        size_t                          lti_count;
        struct lu_attr                  lti_layout_attr;
        size_t                          lti_comp_size;
        size_t                          lti_count;
        struct lu_attr                  lti_layout_attr;
+       /* object allocation avoid guide info */
+       struct lod_avoid_guide          lti_avoid;
 };
 
 extern const struct lu_device_operations lod_lu_ops;
 };
 
 extern const struct lu_device_operations lod_lu_ops;
index cd96cff..ec6d18d 100644 (file)
@@ -1121,15 +1121,16 @@ static int validate_lod_and_idx(struct lod_device *md, __u32 idx)
 int lod_initialize_objects(const struct lu_env *env, struct lod_object *lo,
                           struct lov_ost_data_v1 *objs, int comp_idx)
 {
 int lod_initialize_objects(const struct lu_env *env, struct lod_object *lo,
                           struct lov_ost_data_v1 *objs, int comp_idx)
 {
-       struct lod_layout_component     *lod_comp;
-       struct lod_thread_info  *info = lod_env_info(env);
-       struct lod_device       *md;
-       struct lu_object        *o, *n;
-       struct lu_device        *nd;
-       struct dt_object       **stripe;
-       int                      stripe_len;
-       int                      i, rc = 0;
-       __u32                   idx;
+       struct lod_layout_component *lod_comp;
+       struct lod_thread_info *info = lod_env_info(env);
+       struct lod_device *md;
+       struct lu_object *o, *n;
+       struct lu_device *nd;
+       struct dt_object **stripe = NULL;
+       __u32 *ost_indices = NULL;
+       int stripe_len;
+       int i, rc = 0;
+       __u32 idx;
        ENTRY;
 
        LASSERT(lo != NULL);
        ENTRY;
 
        LASSERT(lo != NULL);
@@ -1146,6 +1147,9 @@ int lod_initialize_objects(const struct lu_env *env, struct lod_object *lo,
        OBD_ALLOC(stripe, sizeof(stripe[0]) * stripe_len);
        if (stripe == NULL)
                RETURN(-ENOMEM);
        OBD_ALLOC(stripe, sizeof(stripe[0]) * stripe_len);
        if (stripe == NULL)
                RETURN(-ENOMEM);
+       OBD_ALLOC(ost_indices, sizeof(*ost_indices) * stripe_len);
+       if (!ost_indices)
+               GOTO(out, rc = -ENOMEM);
 
        for (i = 0; i < lod_comp->llc_stripe_count; i++) {
                if (unlikely(lovea_slot_is_dummy(&objs[i])))
 
        for (i = 0; i < lod_comp->llc_stripe_count; i++) {
                if (unlikely(lovea_slot_is_dummy(&objs[i])))
@@ -1180,6 +1184,7 @@ int lod_initialize_objects(const struct lu_env *env, struct lod_object *lo,
                LASSERT(n);
 
                stripe[i] = container_of(n, struct dt_object, do_lu);
                LASSERT(n);
 
                stripe[i] = container_of(n, struct dt_object, do_lu);
+               ost_indices[i] = idx;
        }
 
 out:
        }
 
 out:
@@ -1190,8 +1195,12 @@ out:
 
                OBD_FREE(stripe, sizeof(stripe[0]) * stripe_len);
                lod_comp->llc_stripe_count = 0;
 
                OBD_FREE(stripe, sizeof(stripe[0]) * stripe_len);
                lod_comp->llc_stripe_count = 0;
+               if (ost_indices)
+                       OBD_FREE(ost_indices,
+                                sizeof(*ost_indices) * stripe_len);
        } else {
                lod_comp->llc_stripe = stripe;
        } else {
                lod_comp->llc_stripe = stripe;
+               lod_comp->llc_ost_indices = ost_indices;
                lod_comp->llc_stripes_allocated = stripe_len;
        }
 
                lod_comp->llc_stripes_allocated = stripe_len;
        }
 
index 7bcc4f2..6eef5f0 100644 (file)
@@ -3683,6 +3683,9 @@ static int lod_layout_del(const struct lu_env *env, struct dt_object *dt,
                OBD_FREE(lod_comp->llc_stripe, sizeof(struct dt_object *) *
                                        lod_comp->llc_stripes_allocated);
                lod_comp->llc_stripe = NULL;
                OBD_FREE(lod_comp->llc_stripe, sizeof(struct dt_object *) *
                                        lod_comp->llc_stripes_allocated);
                lod_comp->llc_stripe = NULL;
+               OBD_FREE(lod_comp->llc_ost_indices,
+                        sizeof(__u32) * lod_comp->llc_stripes_allocated);
+               lod_comp->llc_ost_indices = NULL;
                lod_comp->llc_stripes_allocated = 0;
                lod_obj_set_pool(lo, i, NULL);
                if (lod_comp->llc_ostlist.op_array) {
                lod_comp->llc_stripes_allocated = 0;
                lod_obj_set_pool(lo, i, NULL);
                if (lod_comp->llc_ostlist.op_array) {
@@ -5490,11 +5493,6 @@ out:
        RETURN(rc);
 }
 
        RETURN(rc);
 }
 
-#define lod_foreach_mirror_comp(comp, lo, mirror_idx)                      \
-for (comp = &lo->ldo_comp_entries[lo->ldo_mirrors[mirror_idx].lme_start];  \
-     comp <= &lo->ldo_comp_entries[lo->ldo_mirrors[mirror_idx].lme_end];   \
-     comp++)
-
 static inline int lod_comp_index(struct lod_object *lo,
                                 struct lod_layout_component *lod_comp)
 {
 static inline int lod_comp_index(struct lod_object *lo,
                                 struct lod_layout_component *lod_comp)
 {
@@ -6369,6 +6367,10 @@ void lod_object_free_striping(const struct lu_env *env, struct lod_object *lo)
                                 sizeof(struct dt_object *) *
                                 lod_comp->llc_stripes_allocated);
                        lod_comp->llc_stripe = NULL;
                                 sizeof(struct dt_object *) *
                                 lod_comp->llc_stripes_allocated);
                        lod_comp->llc_stripe = NULL;
+                       OBD_FREE(lod_comp->llc_ost_indices,
+                                sizeof(__u32) *
+                                lod_comp->llc_stripes_allocated);
+                       lod_comp->llc_ost_indices = NULL;
                        lod_comp->llc_stripes_allocated = 0;
                }
                lod_free_comp_entries(lo);
                        lod_comp->llc_stripes_allocated = 0;
                }
                lod_free_comp_entries(lo);
index e00b172..9eb1297 100644 (file)
@@ -77,6 +77,7 @@ int qos_add_tgt(struct lod_device *lod, struct lod_tgt_desc *ost_desc)
        struct obd_export  *exp = ost_desc->ltd_exp;
        int                 rc = 0, found = 0;
        struct list_head   *list;
        struct obd_export  *exp = ost_desc->ltd_exp;
        int                 rc = 0, found = 0;
        struct list_head   *list;
+       __u32 id = 0;
        ENTRY;
 
        down_write(&lod->lod_qos.lq_rw_sem);
        ENTRY;
 
        down_write(&lod->lod_qos.lq_rw_sem);
@@ -91,6 +92,8 @@ int qos_add_tgt(struct lod_device *lod, struct lod_tgt_desc *ost_desc)
                        found++;
                        break;
                }
                        found++;
                        break;
                }
+               if (oss->lqo_id > id)
+                       id = oss->lqo_id;
        }
 
        if (!found) {
        }
 
        if (!found) {
@@ -99,6 +102,8 @@ int qos_add_tgt(struct lod_device *lod, struct lod_tgt_desc *ost_desc)
                        GOTO(out, rc = -ENOMEM);
                memcpy(&oss->lqo_uuid, &exp->exp_connection->c_remote_uuid,
                       sizeof(oss->lqo_uuid));
                        GOTO(out, rc = -ENOMEM);
                memcpy(&oss->lqo_uuid, &exp->exp_connection->c_remote_uuid,
                       sizeof(oss->lqo_uuid));
+               ++id;
+               oss->lqo_id = id;
        } else {
                /* Assume we have to move this one */
                list_del(&oss->lqo_oss_list);
        } else {
                /* Assume we have to move this one */
                list_del(&oss->lqo_oss_list);
@@ -864,19 +869,74 @@ static inline void lod_comp_ost_in_use(struct ost_pool *inuse, int ost)
        }
 }
 
        }
 }
 
+static inline void lod_avoid_update(struct lod_object *lo,
+                                   struct lod_avoid_guide *lag)
+{
+       if (!lod_is_flr(lo))
+               return;
+
+       lag->lag_ost_avail--;
+}
+
+static inline bool lod_should_avoid_ost(struct lod_object *lo,
+                                       struct lod_avoid_guide *lag,
+                                       __u32 index)
+{
+       struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
+       struct lod_tgt_desc *ost = OST_TGT(lod, index);
+       struct lod_qos_oss *lqo = ost->ltd_qos.ltq_oss;
+       bool used = false;
+       int i;
+
+       if (!cfs_bitmap_check(lod->lod_ost_bitmap, index))
+               return true;
+
+       /**
+        * we've tried our best, all available OSTs have been used in
+        * overlapped components in the other mirror
+        */
+       if (lag->lag_ost_avail == 0)
+               return false;
+
+       /* check OSS use */
+       for (i = 0; i < lag->lag_oaa_count; i++) {
+               if (lag->lag_oss_avoid_array[i] == lqo->lqo_id) {
+                       used = true;
+                       break;
+               }
+       }
+       /**
+        * if the OSS which OST[index] resides has not been used, we'd like to
+        * use it
+        */
+       if (!used)
+               return false;
+
+       /* if the OSS has been used, check whether the OST has been used */
+       if (!cfs_bitmap_check(lag->lag_ost_avoid_bitmap, index))
+               used = false;
+       else
+               QOS_DEBUG("OST%d: has been used in overlapped component "
+                         "in other mirror\n", index);
+       return used;
+}
+
 static int lod_check_and_reserve_ost(const struct lu_env *env,
 static int lod_check_and_reserve_ost(const struct lu_env *env,
-                                    struct lod_device *m,
+                                    struct lod_object *lo,
                                     struct obd_statfs *sfs, __u32 ost_idx,
                                     __u32 speed, __u32 *s_idx,
                                     struct dt_object **stripe,
                                     struct obd_statfs *sfs, __u32 ost_idx,
                                     __u32 speed, __u32 *s_idx,
                                     struct dt_object **stripe,
+                                    __u32 *ost_indices,
                                     struct thandle *th,
                                     struct ost_pool *inuse)
 {
                                     struct thandle *th,
                                     struct ost_pool *inuse)
 {
+       struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
+       struct lod_avoid_guide *lag = &lod_env_info(env)->lti_avoid;
        struct dt_object   *o;
        __u32 stripe_idx = *s_idx;
        int rc;
 
        struct dt_object   *o;
        __u32 stripe_idx = *s_idx;
        int rc;
 
-       rc = lod_statfs_and_check(env, m, ost_idx, sfs);
+       rc = lod_statfs_and_check(env, lod, ost_idx, sfs);
        if (rc) {
                /* this OSP doesn't feel well */
                goto out_return;
        if (rc) {
                /* this OSP doesn't feel well */
                goto out_return;
@@ -908,13 +968,22 @@ static int lod_check_and_reserve_ost(const struct lu_env *env,
                goto out_return;
        }
 
                goto out_return;
        }
 
+       /**
+        * try not allocate OSTs used by conflicting component of other mirrors
+        * for the first and second time.
+        */
+       if (speed < 2 && lod_should_avoid_ost(lo, lag, ost_idx)) {
+               QOS_DEBUG("#%d: used by overlapped component of other mirror\n",
+                         ost_idx);
+               goto out_return;
+       }
        /*
         * do not put >1 objects on a single OST
         */
        if (lod_qos_is_ost_used(env, ost_idx, stripe_idx))
                goto out_return;
 
        /*
         * do not put >1 objects on a single OST
         */
        if (lod_qos_is_ost_used(env, ost_idx, stripe_idx))
                goto out_return;
 
-       o = lod_qos_declare_object_on(env, m, ost_idx, th);
+       o = lod_qos_declare_object_on(env, lod, ost_idx, th);
        if (IS_ERR(o)) {
                CDEBUG(D_OTHER, "can't declare new object on #%u: %d\n",
                       ost_idx, (int) PTR_ERR(o));
        if (IS_ERR(o)) {
                CDEBUG(D_OTHER, "can't declare new object on #%u: %d\n",
                       ost_idx, (int) PTR_ERR(o));
@@ -925,9 +994,11 @@ static int lod_check_and_reserve_ost(const struct lu_env *env,
        /*
         * We've successfully declared (reserved) an object
         */
        /*
         * We've successfully declared (reserved) an object
         */
+       lod_avoid_update(lo, lag);
        lod_qos_ost_in_use(env, stripe_idx, ost_idx);
        lod_comp_ost_in_use(inuse, ost_idx);
        stripe[stripe_idx] = o;
        lod_qos_ost_in_use(env, stripe_idx, ost_idx);
        lod_comp_ost_in_use(inuse, ost_idx);
        stripe[stripe_idx] = o;
+       ost_indices[stripe_idx] = ost_idx;
        OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_LOV_CREATE_RACE, 2);
        stripe_idx++;
        *s_idx = stripe_idx;
        OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_LOV_CREATE_RACE, 2);
        stripe_idx++;
        *s_idx = stripe_idx;
@@ -950,21 +1021,22 @@ out_return:
  * time we give priority to targets which already have objects precreated.
  * Full OSTs are skipped (see lod_qos_dev_is_full() for the details).
  *
  * time we give priority to targets which already have objects precreated.
  * Full OSTs are skipped (see lod_qos_dev_is_full() for the details).
  *
- * \param[in] env      execution environment for this thread
- * \param[in] lo       LOD object
- * \param[out] stripe  striping created
- * \param[in] flags    allocation flags (0 or LOV_USES_DEFAULT_STRIPE)
- * \param[in] th       transaction handle
- * \param[in] comp_idx index of ldo_comp_entries
- * \param[in|out] inuse        array of inuse ost index
+ * \param[in] env              execution environment for this thread
+ * \param[in] lo               LOD object
+ * \param[out] stripe          striping created
+ * \param[out] ost_indices     ost indices of striping created
+ * \param[in] flags            allocation flags (0 or LOV_USES_DEFAULT_STRIPE)
+ * \param[in] th               transaction handle
+ * \param[in] comp_idx         index of ldo_comp_entries
+ * \param[in|out] inuse                array of inuse ost index
  *
  * \retval 0           on success
  * \retval -ENOSPC     if not enough OSTs are found
  * \retval negative    negated errno for other failures
  */
 static int lod_alloc_rr(const struct lu_env *env, struct lod_object *lo,
  *
  * \retval 0           on success
  * \retval -ENOSPC     if not enough OSTs are found
  * \retval negative    negated errno for other failures
  */
 static int lod_alloc_rr(const struct lu_env *env, struct lod_object *lo,
-                       struct dt_object **stripe, int flags,
-                       struct thandle *th, int comp_idx,
+                       struct dt_object **stripe, __u32 *ost_indices,
+                       int flags, struct thandle *th, int comp_idx,
                        struct ost_pool *inuse)
 {
        struct lod_layout_component *lod_comp;
                        struct ost_pool *inuse)
 {
        struct lod_layout_component *lod_comp;
@@ -1051,8 +1123,9 @@ repeat_find:
                        continue;
 
                spin_unlock(&lqr->lqr_alloc);
                        continue;
 
                spin_unlock(&lqr->lqr_alloc);
-               rc = lod_check_and_reserve_ost(env, m, sfs, ost_idx, speed,
-                                              &stripe_idx, stripe, th, inuse);
+               rc = lod_check_and_reserve_ost(env, lo, sfs, ost_idx, speed,
+                                              &stripe_idx, stripe, ost_indices,
+                                              th, inuse);
                spin_lock(&lqr->lqr_alloc);
 
                if (rc != 0 && OST_TGT(m, ost_idx)->ltd_connecting)
                spin_lock(&lqr->lqr_alloc);
 
                if (rc != 0 && OST_TGT(m, ost_idx)->ltd_connecting)
@@ -1105,12 +1178,13 @@ out:
  * structures are protected, but no concurrent allocation is allowed on the
  * same objects.
  *
  * structures are protected, but no concurrent allocation is allowed on the
  * same objects.
  *
- * \param[in] env      execution environment for this thread
- * \param[in] lo       LOD object
- * \param[out] stripe  striping created
- * \param[in] th       transaction handle
- * \param[in] comp_idx index of ldo_comp_entries
- * \param[in|out] inuse        array of inuse ost index
+ * \param[in] env              execution environment for this thread
+ * \param[in] lo               LOD object
+ * \param[out] stripe          striping created
+ * \param[out] ost_indices     ost indices of striping created
+ * \param[in] th               transaction handle
+ * \param[in] comp_idx         index of ldo_comp_entries
+ * \param[in|out] inuse                array of inuse ost index
  *
  * \retval 0           on success
  * \retval -ENODEV     OST index does not exist on file system
  *
  * \retval 0           on success
  * \retval -ENODEV     OST index does not exist on file system
@@ -1118,8 +1192,9 @@ out:
  * \retval negative    negated errno on error
  */
 static int lod_alloc_ost_list(const struct lu_env *env, struct lod_object *lo,
  * \retval negative    negated errno on error
  */
 static int lod_alloc_ost_list(const struct lu_env *env, struct lod_object *lo,
-                             struct dt_object **stripe, struct thandle *th,
-                             int comp_idx, struct ost_pool *inuse)
+                             struct dt_object **stripe, __u32 *ost_indices,
+                             struct thandle *th, int comp_idx,
+                             struct ost_pool *inuse)
 {
        struct lod_layout_component *lod_comp;
        struct lod_device       *m = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
 {
        struct lod_layout_component *lod_comp;
        struct lod_device       *m = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
@@ -1190,6 +1265,7 @@ static int lod_alloc_ost_list(const struct lu_env *env, struct lod_object *lo,
                lod_qos_ost_in_use(env, stripe_count, ost_idx);
                lod_comp_ost_in_use(inuse, ost_idx);
                stripe[stripe_count] = o;
                lod_qos_ost_in_use(env, stripe_count, ost_idx);
                lod_comp_ost_in_use(inuse, ost_idx);
                stripe[stripe_count] = o;
+               ost_indices[stripe_count] = ost_idx;
                stripe_count++;
        }
 
                stripe_count++;
        }
 
@@ -1208,13 +1284,14 @@ static int lod_alloc_ost_list(const struct lu_env *env, struct lod_object *lo,
  * release the stripes allocated. All the internal structures are protected,
  * but no concurrent allocation is allowed on the same objects.
  *
  * release the stripes allocated. All the internal structures are protected,
  * but no concurrent allocation is allowed on the same objects.
  *
- * \param[in] env      execution environment for this thread
- * \param[in] lo       LOD object
- * \param[out] stripe  striping created
- * \param[in] flags    not used
- * \param[in] th       transaction handle
- * \param[in] comp_idx index of ldo_comp_entries
- * \param[in|out]inuse array of inuse ost index
+ * \param[in] env              execution environment for this thread
+ * \param[in] lo               LOD object
+ * \param[out] stripe          striping created
+ * \param[out] ost_indices     ost indices of striping created
+ * \param[in] flags            not used
+ * \param[in] th               transaction handle
+ * \param[in] comp_idx         index of ldo_comp_entries
+ * \param[in|out]inuse         array of inuse ost index
  *
  * \retval 0           on success
  * \retval -ENOSPC     if no OST objects are available at all
  *
  * \retval 0           on success
  * \retval -ENOSPC     if no OST objects are available at all
@@ -1223,8 +1300,8 @@ static int lod_alloc_ost_list(const struct lu_env *env, struct lod_object *lo,
  * \retval negative    errno on failure
  */
 static int lod_alloc_specific(const struct lu_env *env, struct lod_object *lo,
  * \retval negative    errno on failure
  */
 static int lod_alloc_specific(const struct lu_env *env, struct lod_object *lo,
-                             struct dt_object **stripe, int flags,
-                             struct thandle *th, int comp_idx,
+                             struct dt_object **stripe, __u32 *ost_indices,
+                             int flags, struct thandle *th, int comp_idx,
                              struct ost_pool *inuse)
 {
        struct lod_layout_component *lod_comp;
                              struct ost_pool *inuse)
 {
        struct lod_layout_component *lod_comp;
@@ -1332,6 +1409,7 @@ repeat_find:
                lod_qos_ost_in_use(env, stripe_num, ost_idx);
                lod_comp_ost_in_use(inuse, ost_idx);
                stripe[stripe_num] = o;
                lod_qos_ost_in_use(env, stripe_num, ost_idx);
                lod_comp_ost_in_use(inuse, ost_idx);
                stripe[stripe_num] = o;
+               ost_indices[stripe_num] = ost_idx;
                stripe_num++;
 
                /* We have enough stripes */
                stripe_num++;
 
                /* We have enough stripes */
@@ -1414,13 +1492,14 @@ static inline int lod_qos_is_usable(struct lod_device *lod)
  * An OST with a higher weight is proportionately more likely to be selected
  * than one with a lower weight.
  *
  * An OST with a higher weight is proportionately more likely to be selected
  * than one with a lower weight.
  *
- * \param[in] env      execution environment for this thread
- * \param[in] lo       LOD object
- * \param[out] stripe  striping created
- * \param[in] flags    0 or LOV_USES_DEFAULT_STRIPE
- * \param[in] th       transaction handle
- * \param[in] comp_idx index of ldo_comp_entries
- * \param[in|out]inuse array of inuse ost index
+ * \param[in] env              execution environment for this thread
+ * \param[in] lo               LOD object
+ * \param[out] stripe          striping created
+ * \param[out] ost_indices     ost indices of striping created
+ * \param[in] flags            0 or LOV_USES_DEFAULT_STRIPE
+ * \param[in] th               transaction handle
+ * \param[in] comp_idx         index of ldo_comp_entries
+ * \param[in|out]inuse         array of inuse ost index
  *
  * \retval 0           on success
  * \retval -EAGAIN     not enough OSTs are found for specified stripe count
  *
  * \retval 0           on success
  * \retval -EAGAIN     not enough OSTs are found for specified stripe count
@@ -1428,13 +1507,14 @@ static inline int lod_qos_is_usable(struct lod_device *lod)
  * \retval negative    errno on failure
  */
 static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo,
  * \retval negative    errno on failure
  */
 static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo,
-                        struct dt_object **stripe, int flags,
-                        struct thandle *th, int comp_idx,
+                        struct dt_object **stripe, __u32 *ost_indices,
+                        int flags, struct thandle *th, int comp_idx,
                         struct ost_pool *inuse)
 {
        struct lod_layout_component *lod_comp;
        struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
        struct obd_statfs *sfs = &lod_env_info(env)->lti_osfs;
                         struct ost_pool *inuse)
 {
        struct lod_layout_component *lod_comp;
        struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
        struct obd_statfs *sfs = &lod_env_info(env)->lti_osfs;
+       struct lod_avoid_guide *lag = &lod_env_info(env)->lti_avoid;
        struct lod_tgt_desc *ost;
        struct dt_object *o;
        __u64 total_weight = 0;
        struct lod_tgt_desc *ost;
        struct dt_object *o;
        __u64 total_weight = 0;
@@ -1563,7 +1643,7 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo,
                for (i = 0; i < osts->op_count; i++) {
                        __u32 idx = osts->op_array[i];
 
                for (i = 0; i < osts->op_count; i++) {
                        __u32 idx = osts->op_array[i];
 
-                       if (!cfs_bitmap_check(lod->lod_ost_bitmap, idx))
+                       if (lod_should_avoid_ost(lo, lag, idx))
                                continue;
 
                        ost = OST_TGT(lod, idx);
                                continue;
 
                        ost = OST_TGT(lod, idx);
@@ -1595,10 +1675,13 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo,
                                continue;
                        }
 
                                continue;
                        }
 
+                       lod_avoid_update(lo, lag);
                        lod_qos_ost_in_use(env, nfound, idx);
                        lod_comp_ost_in_use(inuse, idx);
                        lod_qos_ost_in_use(env, nfound, idx);
                        lod_comp_ost_in_use(inuse, idx);
-                       stripe[nfound++] = o;
+                       stripe[nfound] = o;
+                       ost_indices[nfound] = idx;
                        lod_qos_used(lod, osts, idx, &total_weight);
                        lod_qos_used(lod, osts, idx, &total_weight);
+                       nfound++;
                        rc = 0;
                        break;
                }
                        rc = 0;
                        break;
                }
@@ -2052,6 +2135,138 @@ free_comp:
 }
 
 /**
 }
 
 /**
+ * Prepare the OST/OSS avoidance guide before allocating objects.
+ *
+ * Operates on the guide stored in lod_env_info(env)->lti_avoid: the count
+ * of available OSTs is reset to the current OST target count, the OSS avoid
+ * array is emptied, and the OST avoid bitmap is cleared. Either buffer is
+ * (re)allocated when it is missing or smaller than the current OST target
+ * count.
+ *
+ * \param[in] env      execution environment holding the avoidance guide
+ * \param[in] lo       object whose LOD device supplies the OST target count
+ *
+ * \retval 0           on success
+ * \retval -ENOMEM     if the bitmap or the OSS array cannot be allocated
+ */
+int lod_prepare_avoidance(const struct lu_env *env, struct lod_object *lo)
+{
+       struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
+       struct lod_tgt_descs *ltds = &lod->lod_ost_descs;
+       struct lod_avoid_guide *lag = &lod_env_info(env)->lti_avoid;
+       struct cfs_bitmap *bitmap = NULL;
+       __u32 *new_oss = NULL;
+
+       lag->lag_ost_avail = ltds->ltd_tgtnr;
+
+       /* reset OSS avoid guide array, dropping it if it became too small */
+       lag->lag_oaa_count = 0;
+       if (lag->lag_oss_avoid_array && lag->lag_oaa_size < ltds->ltd_tgtnr) {
+               OBD_FREE(lag->lag_oss_avoid_array,
+                        sizeof(__u32) * lag->lag_oaa_size);
+               lag->lag_oss_avoid_array = NULL;
+               lag->lag_oaa_size = 0;
+       }
+
+       /* init OST avoid guide bitmap: reuse it if large enough */
+       if (lag->lag_ost_avoid_bitmap) {
+               if (ltds->ltd_tgtnr <= lag->lag_ost_avoid_bitmap->size) {
+                       CFS_RESET_BITMAP(lag->lag_ost_avoid_bitmap);
+               } else {
+                       CFS_FREE_BITMAP(lag->lag_ost_avoid_bitmap);
+                       lag->lag_ost_avoid_bitmap = NULL;
+               }
+       }
+
+       if (!lag->lag_ost_avoid_bitmap) {
+               bitmap = CFS_ALLOCATE_BITMAP(ltds->ltd_tgtnr);
+               if (!bitmap)
+                       return -ENOMEM;
+       }
+
+       if (!lag->lag_oss_avoid_array) {
+               /**
+                * usually there are multiple OSTs in one OSS, but we don't
+                * know the exact OSS number, so we choose a safe option,
+                * using OST count to allocate the array to store the OSS
+                * id.
+                */
+               OBD_ALLOC(new_oss, sizeof(*new_oss) * ltds->ltd_tgtnr);
+               if (!new_oss) {
+                       /*
+                        * bitmap is still NULL when the existing bitmap was
+                        * reused above; only release one allocated here, as
+                        * CFS_FREE_BITMAP() must not be handed NULL.
+                        */
+                       if (bitmap)
+                               CFS_FREE_BITMAP(bitmap);
+                       return -ENOMEM;
+               }
+       }
+
+       /* publish the new buffers only once every allocation succeeded */
+       if (new_oss) {
+               lag->lag_oss_avoid_array = new_oss;
+               lag->lag_oaa_size = ltds->ltd_tgtnr;
+       }
+       if (bitmap)
+               lag->lag_ost_avoid_bitmap = bitmap;
+
+       return 0;
+}
+
+/**
+ * Record the OSTs and OSSs already used by other mirrors over the extent
+ * of component \a comp_idx.
+ *
+ * Every mirror except the one owning ldo_comp_entries[comp_idx] is walked.
+ * For each of its instantiated components overlapping that extent, the OST
+ * index of every stripe is marked in lag->lag_ost_avoid_bitmap; each newly
+ * marked OST decrements lag->lag_ost_avail and has its OSS id appended to
+ * lag->lag_oss_avoid_array unless that id is already recorded.
+ *
+ * \param[in]     lo        object whose mirrors are inspected
+ * \param[in,out] lag       avoidance guide to fill in
+ * \param[in]     comp_idx  index of the component being allocated
+ */
+void lod_collect_avoidance(struct lod_object *lo, struct lod_avoid_guide *lag,
+                          int comp_idx)
+{
+       struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
+       struct lod_layout_component *target = &lo->ldo_comp_entries[comp_idx];
+       struct cfs_bitmap *avoid_map = lag->lag_ost_avoid_bitmap;
+       int mi;
+
+       for (mi = 0; mi < lo->ldo_mirror_count; mi++) {
+               struct lod_layout_component *comp;
+               int si;
+
+               /**
+                * Only components of *other* mirrors matter: if the OSTs
+                * backing one mirror's component become unavailable, a read
+                * can still be served from another mirror placed on
+                * different OSTs. Mirrors whose first component has no
+                * valid id are skipped as well.
+                */
+               comp = &lo->ldo_comp_entries[lo->ldo_mirrors[mi].lme_start];
+               if (comp->llc_id == LCME_ID_INVAL)
+                       continue;
+               if (mirror_id_of(comp->llc_id) == mirror_id_of(target->llc_id))
+                       continue;
+
+               /* walk the components of this mirror */
+               lod_foreach_mirror_comp(comp, lo, mi) {
+                       /* ignore components that do not overlap the target */
+                       if (!lu_extent_is_overlapped(&comp->llc_extent,
+                                                    &target->llc_extent))
+                               continue;
+                       /* ignore components without instantiated stripes */
+                       if (!lod_comp_inited(comp) || !comp->llc_stripe)
+                               continue;
+
+                       /* record the OST and OSS behind every stripe */
+                       for (si = 0; si < comp->llc_stripe_count; si++) {
+                               struct lod_tgt_desc *ost;
+                               struct lod_qos_oss *lqo;
+                               int pos = 0;
+
+                               ost = OST_TGT(lod, comp->llc_ost_indices[si]);
+                               lqo = ost->ltd_qos.ltq_oss;
+
+                               /* this OST has been accounted for already */
+                               if (cfs_bitmap_check(avoid_map, ost->ltd_index))
+                                       continue;
+
+                               cfs_bitmap_set(avoid_map, ost->ltd_index);
+                               lag->lag_ost_avail--;
+
+                               /* search the recorded OSS ids for this one */
+                               while (pos < lag->lag_oaa_count &&
+                                      lag->lag_oss_avoid_array[pos] !=
+                                                               lqo->lqo_id)
+                                       pos++;
+
+                               /* not recorded yet: append it */
+                               if (pos == lag->lag_oaa_count) {
+                                       lag->lag_oss_avoid_array[pos] =
+                                                               lqo->lqo_id;
+                                       lag->lag_oaa_count++;
+                               }
+                       }
+               }
+       }
+}
+
+/**
  * Create a striping for an object.
  *
  * The function creates a new striping for the object. The function tries QoS
  * Create a striping for an object.
  *
  * The function creates a new striping for the object. The function tries QoS
@@ -2076,10 +2291,12 @@ int lod_qos_prep_create(const struct lu_env *env, struct lod_object *lo,
 {
        struct lod_layout_component *lod_comp;
        struct lod_device      *d = lu2lod_dev(lod2lu_obj(lo)->lo_dev);
 {
        struct lod_layout_component *lod_comp;
        struct lod_device      *d = lu2lod_dev(lod2lu_obj(lo)->lo_dev);
-       struct dt_object      **stripe;
        int                     stripe_len;
        int                     flag = LOV_USES_ASSIGNED_STRIPE;
        int                     i, rc = 0;
        int                     stripe_len;
        int                     flag = LOV_USES_ASSIGNED_STRIPE;
        int                     i, rc = 0;
+       struct lod_avoid_guide *lag = &lod_env_info(env)->lti_avoid;
+       struct dt_object **stripe = NULL;
+       __u32 *ost_indices = NULL;
        ENTRY;
 
        LASSERT(lo);
        ENTRY;
 
        LASSERT(lo);
@@ -2112,6 +2329,9 @@ int lod_qos_prep_create(const struct lu_env *env, struct lod_object *lo,
                OBD_ALLOC(stripe, sizeof(stripe[0]) * stripe_len);
                if (stripe == NULL)
                        GOTO(out, rc = -ENOMEM);
                OBD_ALLOC(stripe, sizeof(stripe[0]) * stripe_len);
                if (stripe == NULL)
                        GOTO(out, rc = -ENOMEM);
+               OBD_ALLOC(ost_indices, sizeof(*ost_indices) * stripe_len);
+               if (!ost_indices)
+                       GOTO(out, rc = -ENOMEM);
 
                lod_getref(&d->lod_ost_descs);
                /* XXX: support for non-0 files w/o objects */
 
                lod_getref(&d->lod_ost_descs);
                /* XXX: support for non-0 files w/o objects */
@@ -2119,29 +2339,38 @@ int lod_qos_prep_create(const struct lu_env *env, struct lod_object *lo,
                                d->lod_desc.ld_tgt_count, stripe_len);
 
                if (lod_comp->llc_ostlist.op_array) {
                                d->lod_desc.ld_tgt_count, stripe_len);
 
                if (lod_comp->llc_ostlist.op_array) {
-                       rc = lod_alloc_ost_list(env, lo, stripe, th, comp_idx,
-                                               inuse);
+                       rc = lod_alloc_ost_list(env, lo, stripe, ost_indices,
+                                               th, comp_idx, inuse);
                } else if (lod_comp->llc_stripe_offset == LOV_OFFSET_DEFAULT) {
                } else if (lod_comp->llc_stripe_offset == LOV_OFFSET_DEFAULT) {
-                       rc = lod_alloc_qos(env, lo, stripe, flag, th,
-                                          comp_idx, inuse);
+                       /**
+                        * collect OSTs and OSSs used in other mirrors whose
+                        * components cross the ldo_comp_entries[comp_idx]
+                        */
+                       rc = lod_prepare_avoidance(env, lo);
+                       if (rc)
+                               GOTO(put_ldts, rc);
+
+                       lod_collect_avoidance(lo, lag, comp_idx);
+
+                       rc = lod_alloc_qos(env, lo, stripe, ost_indices, flag,
+                                          th, comp_idx, inuse);
                        if (rc == -EAGAIN)
                        if (rc == -EAGAIN)
-                               rc = lod_alloc_rr(env, lo, stripe, flag, th,
-                                                 comp_idx, inuse);
+                               rc = lod_alloc_rr(env, lo, stripe, ost_indices,
+                                                 flag, th, comp_idx, inuse);
                } else {
                } else {
-                       rc = lod_alloc_specific(env, lo, stripe, flag, th,
-                                               comp_idx, inuse);
+                       rc = lod_alloc_specific(env, lo, stripe, ost_indices,
+                                               flag, th, comp_idx, inuse);
                }
                }
+put_ldts:
                lod_putref(d, &d->lod_ost_descs);
                lod_putref(d, &d->lod_ost_descs);
-
                if (rc < 0) {
                        for (i = 0; i < stripe_len; i++)
                                if (stripe[i] != NULL)
                                        dt_object_put(env, stripe[i]);
                if (rc < 0) {
                        for (i = 0; i < stripe_len; i++)
                                if (stripe[i] != NULL)
                                        dt_object_put(env, stripe[i]);
-
-                       OBD_FREE(stripe, sizeof(stripe[0]) * stripe_len);
                        lod_comp->llc_stripe_count = 0;
                } else {
                        lod_comp->llc_stripe = stripe;
                        lod_comp->llc_stripe_count = 0;
                } else {
                        lod_comp->llc_stripe = stripe;
+                       lod_comp->llc_ost_indices = ost_indices;
                        lod_comp->llc_stripes_allocated = stripe_len;
                }
        } else {
                        lod_comp->llc_stripes_allocated = stripe_len;
                }
        } else {
@@ -2172,6 +2401,13 @@ int lod_qos_prep_create(const struct lu_env *env, struct lod_object *lo,
        }
 
 out:
        }
 
 out:
+       if (rc < 0) {
+               if (stripe)
+                       OBD_FREE(stripe, sizeof(stripe[0]) * stripe_len);
+               if (ost_indices)
+                       OBD_FREE(ost_indices,
+                                sizeof(*ost_indices) * stripe_len);
+       }
        RETURN(rc);
 }
 
        RETURN(rc);
 }
 
index 1ac707d..a68c340 100644 (file)
@@ -1868,6 +1868,77 @@ test_46() {
 }
 run_test 46 "Verify setstripe --copy option"
 
 }
 run_test 46 "Verify setstripe --copy option"
 
+test_47() { # check OST avoidance between overlapping FLR mirror components
+       [ $OSTCOUNT -lt 3 ] && skip "needs >= 3 OSTs" && return
+
+       local file=$DIR/$tdir/$tfile
+       local ids       # array of lcme_id values parsed from getstripe
+       local ost       # l_ost_idx field ($5) of a single component
+       local osts      # per-component l_ost_idx fields, used by test case 2
+
+       test_mkdir $DIR/$tdir
+
+       # test case 1: mirror2 must stay off the OSTs pinned for mirror1
+       rm -f $file
+       # mirror1: [comp0]ost0,    [comp1]ost1 and ost2
+       # mirror2: [comp2] should not use ost0, [comp3] not ost1 or ost2
+       $LFS mirror create -N -E2m -c1 -o0 --flags=prefer -Eeof -c2 -o1,2 \
+               -N -E2m -c1 -Eeof -c1 $file || error "create FLR $file failed"
+       ids=($($LFS getstripe $file | awk '/lcme_id/{print $2}' | tr '\n' ' '))
+
+       dd if=/dev/zero of=$file bs=1M count=3 || error "dd $file failed"
+       $LFS mirror resync $file || error "resync $file failed"
+
+       ost=$($LFS getstripe -I${ids[2]} $file | awk '/l_ost_idx/{print $5}')
+       if [[ x$ost == "x0," ]]; then # awk's $5 is expected as "<idx>,"
+               $LFS getstripe $file
+               error "component ${ids[2]} objects allocated on $ost " \
+                     "shouldn't on OST0"
+       fi
+
+       ost=$($LFS getstripe -I${ids[3]} $file | awk '/l_ost_idx/{print $5}')
+       if [[ x$ost == "x1," || x$ost == "x2," ]]; then
+               $LFS getstripe $file
+               error "component ${ids[3]} objects allocated on $ost " \
+                     "shouldn't on OST1 or on OST2"
+       fi
+
+       ## test case 2: four identical mirrors, no OSTs specified explicitly
+       rm -f $file
+       # mirror1: [comp0]    [comp1]
+       # mirror2: [comp2]    [comp3]
+       # mirror3: [comp4]    [comp5]
+       # mirror4: [comp6]    [comp7]
+       $LFS mirror create -N4 -E1m -c1 -Eeof -c1 $file ||
+               error "create FLR $file failed"
+       ids=($($LFS getstripe $file | awk '/lcme_id/{print $2}' | tr '\n' ' '))
+
+       dd if=/dev/zero of=$file bs=1M count=3 || error "dd $file failed"
+       $LFS mirror resync $file || error "resync $file failed"
+
+       for ((i = 0; i < 6; i++)); do # comp0..comp5 only; i is not local
+               osts[$i]=$($LFS getstripe -I${ids[$i]} $file |
+                       awk '/l_ost_idx/{print $5}')
+       done
+       # comp[0],comp[2],comp[4] should use different osts
+       if [[ ${osts[0]} == ${osts[2]} || ${osts[0]} == ${osts[4]} ||
+             ${osts[2]} == ${osts[4]} ]]; then
+               $LFS getstripe $file
+               error "component ${ids[0]}, ${ids[2]}, ${ids[4]} have objects "\
+                     "allocated on duplicated OSTs"
+       fi
+       # comp[1],comp[3],comp[5] should use different osts
+       if [[ ${osts[1]} == ${osts[3]} || ${osts[1]} == ${osts[5]} ||
+             ${osts[3]} == ${osts[5]} ]]; then
+               $LFS getstripe $file
+               error "component ${ids[1]}, ${ids[3]}, ${ids[5]} have objects "\
+                     "allocated on duplicated OSTs"
+       fi
+
+       return 0
+}
+run_test 47 "Verify mirror obj alloc"
+
 ctrl_file=$(mktemp /tmp/CTRL.XXXXXX)
 lock_file=$(mktemp /var/lock/FLR.XXXXXX)
 
 ctrl_file=$(mktemp /tmp/CTRL.XXXXXX)
 lock_file=$(mktemp /var/lock/FLR.XXXXXX)