Whamcloud - gitweb
LU-8575 lod: clear ost usable flag to avoid striping.
[fs/lustre-release.git] / lustre / lod / lod_qos.c
index 4bd0906..1288bab 100644 (file)
@@ -23,7 +23,7 @@
  * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved
  * Use is subject to license terms.
  *
- * Copyright (c) 2012, 2014, Intel Corporation.
+ * Copyright (c) 2012, 2015, Intel Corporation.
  */
 /*
  * This file is part of Lustre, http://www.lustre.org/
 
 #include <asm/div64.h>
 #include <libcfs/libcfs.h>
-#include <obd_class.h>
 #include <lustre/lustre_idl.h>
+#include <lustre_swab.h>
+#include <obd_class.h>
+
 #include "lod_internal.h"
 
 /*
@@ -197,6 +199,11 @@ static int lod_statfs_and_check(const struct lu_env *env, struct lod_device *d,
        LASSERT(ost);
 
        rc = dt_statfs(env, ost->ltd_ost, sfs);
+
+       if (rc == 0 && ((sfs->os_state & OS_STATE_ENOSPC) ||
+           (sfs->os_state & OS_STATE_ENOINO && sfs->os_fprecreated == 0)))
+               RETURN(-ENOSPC);
+
        if (rc && rc != -ENOTCONN)
                CERROR("%s: statfs: rc = %d\n", lod2obd(d)->obd_name, rc);
 
@@ -207,9 +214,12 @@ static int lod_statfs_and_check(const struct lu_env *env, struct lod_device *d,
        /* check whether device has changed state (active, inactive) */
        if (rc != 0 && ost->ltd_active) {
                /* turned inactive? */
-               spin_lock(&d->lod_desc_lock);
+               spin_lock(&d->lod_lock);
                if (ost->ltd_active) {
                        ost->ltd_active = 0;
+                       if (rc == -ENOTCONN)
+                               ost->ltd_connecting = 1;
+
                        LASSERT(d->lod_desc.ld_active_tgt_count > 0);
                        d->lod_desc.ld_active_tgt_count--;
                        d->lod_qos.lq_dirty = 1;
@@ -217,22 +227,23 @@ static int lod_statfs_and_check(const struct lu_env *env, struct lod_device *d,
                        CDEBUG(D_CONFIG, "%s: turns inactive\n",
                               ost->ltd_exp->exp_obd->obd_name);
                }
-               spin_unlock(&d->lod_desc_lock);
+               spin_unlock(&d->lod_lock);
        } else if (rc == 0 && ost->ltd_active == 0) {
                /* turned active? */
                LASSERTF(d->lod_desc.ld_active_tgt_count < d->lod_ostnr,
                         "active tgt count %d, ost nr %d\n",
                         d->lod_desc.ld_active_tgt_count, d->lod_ostnr);
-               spin_lock(&d->lod_desc_lock);
+               spin_lock(&d->lod_lock);
                if (ost->ltd_active == 0) {
                        ost->ltd_active = 1;
+                       ost->ltd_connecting = 0;
                        d->lod_desc.ld_active_tgt_count++;
                        d->lod_qos.lq_dirty = 1;
                        d->lod_qos.lq_rr.lqr_dirty = 1;
                        CDEBUG(D_CONFIG, "%s: turns active\n",
                               ost->ltd_exp->exp_obd->obd_name);
                }
-               spin_unlock(&d->lod_desc_lock);
+               spin_unlock(&d->lod_lock);
        }
 
        RETURN(rc);
@@ -511,9 +522,9 @@ static int lod_qos_used(struct lod_device *lod, struct ost_pool *osts,
                if (ost->ltd_qos.ltq_usable)
                        *total_wt += ost->ltd_qos.ltq_weight;
 
-               QOS_DEBUG("recalc tgt %d usable=%d avail="LPU64
-                         " ostppo="LPU64" ostp="LPU64" ossppo="LPU64
-                         " ossp="LPU64" wt="LPU64"\n",
+               QOS_DEBUG("recalc tgt %d usable=%d avail=%llu"
+                         " ostppo=%llu ostp=%llu ossppo=%llu"
+                         " ossp=%llu wt=%llu\n",
                          i, ost->ltd_qos.ltq_usable, TGT_BAVAIL(i) >> 10,
                          ost->ltd_qos.ltq_penalty_per_obj >> 10,
                          ost->ltd_qos.ltq_penalty >> 10,
@@ -736,30 +747,6 @@ static int min_stripe_count(__u32 stripe_cnt, int flags)
 #define LOV_CREATE_RESEED_MIN  2000
 
 /**
- * Check if an OST is full.
- *
- * Check whether an OST should be considered full based
- * on the given statfs data.
- *
- * \param[in] msfs     statfs data
- *
- * \retval false       not full
- * \retval true                full
- */
-static int inline lod_qos_dev_is_full(struct obd_statfs *msfs)
-{
-       __u64 used;
-       int   bs = msfs->os_bsize;
-
-       LASSERT(((bs - 1) & bs) == 0);
-
-       /* the minimum of 0.1% used blocks and 1GB bytes. */
-       used = min_t(__u64, (msfs->os_blocks - msfs->os_bfree) >> 10,
-                       1 << (31 - ffs(bs)));
-       return (msfs->os_bavail < used);
-}
-
-/**
  * Initialize temporary OST-in-use array.
  *
  * Allocate or extend the array used to mark targets already assigned to a new
@@ -850,14 +837,6 @@ static int lod_check_and_reserve_ost(const struct lu_env *env,
        }
 
        /*
-        * skip full devices
-        */
-       if (lod_qos_dev_is_full(sfs)) {
-               QOS_DEBUG("#%d is full\n", ost_idx);
-               goto out_return;
-       }
-
-       /*
         * We expect number of precreated objects in f_ffree at
         * the first iteration, skip OSPs with no objects ready
         */
@@ -877,7 +856,7 @@ static int lod_check_and_reserve_ost(const struct lu_env *env,
        /*
         * do not put >1 objects on a single OST
         */
-       if (speed && lod_qos_is_ost_used(env, ost_idx, stripe_idx))
+       if (lod_qos_is_ost_used(env, ost_idx, stripe_idx))
                goto out_return;
 
        o = lod_qos_declare_object_on(env, m, ost_idx, th);
@@ -893,6 +872,7 @@ static int lod_check_and_reserve_ost(const struct lu_env *env,
         */
        lod_qos_ost_in_use(env, stripe_idx, ost_idx);
        stripe[stripe_idx] = o;
+       OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_LOV_CREATE_RACE, 2);
        stripe_idx++;
        *s_idx = stripe_idx;
 
@@ -937,6 +917,7 @@ static int lod_alloc_rr(const struct lu_env *env, struct lod_object *lo,
        int                rc;
        __u32              ost_start_idx_temp;
        int                speed = 0;
+       int                ost_connecting = 0;
        __u32              stripe_idx = 0;
        __u32              stripe_cnt = lo->ldo_stripenr;
        __u32              stripe_cnt_min = min_stripe_count(stripe_cnt, flags);
@@ -1012,11 +993,16 @@ repeat_find:
                rc = lod_check_and_reserve_ost(env, m, sfs, ost_idx, speed,
                                               &stripe_idx, stripe, th);
                spin_lock(&lqr->lqr_alloc);
+
+               if (rc != 0 && OST_TGT(m, ost_idx)->ltd_connecting)
+                       ost_connecting = 1;
        }
        if ((speed < 2) && (stripe_idx < stripe_cnt_min)) {
                /* Try again, allowing slower OSCs */
                speed++;
                lqr->lqr_start_idx = ost_start_idx_temp;
+
+               ost_connecting = 0;
                goto repeat_find;
        }
 
@@ -1029,7 +1015,10 @@ repeat_find:
                rc = 0;
        } else {
                /* nobody provided us with a single object */
-               rc = -ENOSPC;
+               if (ost_connecting)
+                       rc = -EINPROGRESS;
+               else
+                       rc = -ENOSPC;
        }
 
 out:
@@ -1090,7 +1079,7 @@ static int lod_alloc_ost_list(const struct lu_env *env,
 
        v3 = (struct lov_user_md_v3 *)lum;
        for (i = 0; i < lo->ldo_stripenr; i++) {
-               if (v3->lmm_objects[i].l_ost_idx == lo->ldo_def_stripe_offset) {
+               if (v3->lmm_objects[i].l_ost_idx == lo->ldo_stripe_offset) {
                        array_idx = i;
                        break;
                }
@@ -1098,7 +1087,7 @@ static int lod_alloc_ost_list(const struct lu_env *env,
        if (i == lo->ldo_stripenr) {
                CDEBUG(D_OTHER,
                       "%s: start index %d not in the specified list of OSTs\n",
-                      lod2obd(m)->obd_name, lo->ldo_def_stripe_offset);
+                      lod2obd(m)->obd_name, lo->ldo_stripe_offset);
                RETURN(-EINVAL);
        }
 
@@ -1146,7 +1135,7 @@ static int lod_alloc_ost_list(const struct lu_env *env,
 /**
  * Allocate a striping on a predefined set of OSTs.
  *
- * Allocates new layout starting from OST index in lo->ldo_def_stripe_offset.
+ * Allocates new layout starting from OST index in lo->ldo_stripe_offset.
  * Full OSTs are not considered. The exact order of OSTs is not important and
  * varies depending on OST status. The allocation procedure prefers the targets
  * with precreated objects ready. The number of stripes needed and stripe
@@ -1202,15 +1191,14 @@ repeat_find:
        /* search loi_ost_idx in ost array */
        array_idx = 0;
        for (i = 0; i < ost_count; i++) {
-               if (osts->op_array[i] == lo->ldo_def_stripe_offset) {
+               if (osts->op_array[i] == lo->ldo_stripe_offset) {
                        array_idx = i;
                        break;
                }
        }
        if (i == ost_count) {
                CERROR("Start index %d not found in pool '%s'\n",
-                      lo->ldo_def_stripe_offset,
-                      lo->ldo_pool ? lo->ldo_pool : "");
+                      lo->ldo_stripe_offset, lo->ldo_pool ?: "");
                GOTO(out, rc = -EINVAL);
        }
 
@@ -1361,18 +1349,18 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo,
                         struct dt_object **stripe, int flags,
                         struct thandle *th)
 {
-       struct lod_device   *m = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
-       struct obd_statfs   *sfs = &lod_env_info(env)->lti_osfs;
+       struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
+       struct obd_statfs *sfs = &lod_env_info(env)->lti_osfs;
        struct lod_tgt_desc *ost;
-       struct dt_object    *o;
-       __u64                total_weight = 0;
-       unsigned int         i;
-       int                  rc = 0;
-       __u32                nfound, good_osts;
-       __u32                stripe_cnt = lo->ldo_stripenr;
-       __u32                stripe_cnt_min;
-       struct pool_desc    *pool = NULL;
-       struct ost_pool    *osts;
+       struct dt_object *o;
+       __u64 total_weight = 0;
+       __u32 nfound, good_osts;
+       __u32 stripe_cnt = lo->ldo_stripenr;
+       __u32 stripe_cnt_min;
+       struct pool_desc *pool = NULL;
+       struct ost_pool *osts;
+       unsigned int i;
+       int rc = 0;
        ENTRY;
 
        stripe_cnt_min = min_stripe_count(stripe_cnt, flags);
@@ -1380,30 +1368,30 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo,
                RETURN(-EINVAL);
 
        if (lo->ldo_pool)
-               pool = lod_find_pool(m, lo->ldo_pool);
+               pool = lod_find_pool(lod, lo->ldo_pool);
 
        if (pool != NULL) {
                down_read(&pool_tgt_rw_sem(pool));
                osts = &(pool->pool_obds);
        } else {
-               osts = &(m->lod_pool_info);
+               osts = &(lod->lod_pool_info);
        }
 
        /* Detect -EAGAIN early, before expensive lock is taken. */
-       if (!lod_qos_is_usable(m))
+       if (!lod_qos_is_usable(lod))
                GOTO(out_nolock, rc = -EAGAIN);
 
        /* Do actual allocation, use write lock here. */
-       down_write(&m->lod_qos.lq_rw_sem);
+       down_write(&lod->lod_qos.lq_rw_sem);
 
        /*
         * Check again, while we were sleeping on @lq_rw_sem things could
         * change.
         */
-       if (!lod_qos_is_usable(m))
+       if (!lod_qos_is_usable(lod))
                GOTO(out, rc = -EAGAIN);
 
-       rc = lod_qos_calc_ppo(m);
+       rc = lod_qos_calc_ppo(lod);
        if (rc)
                GOTO(out, rc);
 
@@ -1414,19 +1402,19 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo,
        good_osts = 0;
        /* Find all the OSTs that are valid stripe candidates */
        for (i = 0; i < osts->op_count; i++) {
-               if (!cfs_bitmap_check(m->lod_ost_bitmap, osts->op_array[i]))
+               if (!cfs_bitmap_check(lod->lod_ost_bitmap, osts->op_array[i]))
                        continue;
 
-               rc = lod_statfs_and_check(env, m, osts->op_array[i], sfs);
+               ost = OST_TGT(lod, osts->op_array[i]);
+               ost->ltd_qos.ltq_usable = 0;
+
+               rc = lod_statfs_and_check(env, lod, osts->op_array[i], sfs);
                if (rc) {
                        /* this OSP doesn't feel well */
                        continue;
                }
 
-               /*
-                * skip full devices
-                */
-               if (lod_qos_dev_is_full(sfs))
+               if (sfs->os_state & OS_STATE_DEGRADED)
                        continue;
 
                /* Fail Check before osc_precreate() is called
@@ -1435,9 +1423,8 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo,
                                   osts->op_array[i] == 0)
                        continue;
 
-               ost = OST_TGT(m,osts->op_array[i]);
                ost->ltd_qos.ltq_usable = 1;
-               lod_qos_calc_weight(m, osts->op_array[i]);
+               lod_qos_calc_weight(lod, osts->op_array[i]);
                total_weight += ost->ltd_qos.ltq_weight;
 
                good_osts++;
@@ -1490,17 +1477,17 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo,
                for (i = 0; i < osts->op_count; i++) {
                        __u32 idx = osts->op_array[i];
 
-                       if (!cfs_bitmap_check(m->lod_ost_bitmap, idx))
+                       if (!cfs_bitmap_check(lod->lod_ost_bitmap, idx))
                                continue;
 
-                       ost = OST_TGT(m,idx);
+                       ost = OST_TGT(lod, idx);
 
                        if (!ost->ltd_qos.ltq_usable)
                                continue;
 
                        cur_weight += ost->ltd_qos.ltq_weight;
-                       QOS_DEBUG("stripe_cnt=%d nfound=%d cur_weight="LPU64
-                                 " rand="LPU64" total_weight="LPU64"\n",
+                       QOS_DEBUG("stripe_cnt=%d nfound=%d cur_weight=%llu"
+                                 " rand=%llu total_weight=%llu\n",
                                  stripe_cnt, nfound, cur_weight, rand,
                                  total_weight);
 
@@ -1516,14 +1503,14 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo,
                                continue;
                        lod_qos_ost_in_use(env, nfound, idx);
 
-                       o = lod_qos_declare_object_on(env, m, idx, th);
+                       o = lod_qos_declare_object_on(env, lod, idx, th);
                        if (IS_ERR(o)) {
                                QOS_DEBUG("can't declare object on #%u: %d\n",
                                          idx, (int) PTR_ERR(o));
                                continue;
                        }
                        stripe[nfound++] = o;
-                       lod_qos_used(m, osts, idx, &total_weight);
+                       lod_qos_used(lod, osts, idx, &total_weight);
                        rc = 0;
                        break;
                }
@@ -1542,7 +1529,8 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo,
                 * so it's possible OSP won't be able to provide us with
                 * an object due to just changed state
                 */
-               LCONSOLE_INFO("wanted %d, found %d\n", stripe_cnt, nfound);
+               QOS_DEBUG("%s: wanted %d objects, found only %d\n",
+                         lod2obd(lod)->obd_name, stripe_cnt, nfound);
                for (i = 0; i < nfound; i++) {
                        LASSERT(stripe[i] != NULL);
                        lu_object_put(env, &stripe[i]->do_lu);
@@ -1550,14 +1538,14 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo,
                }
 
                /* makes sense to rebalance next time */
-               m->lod_qos.lq_dirty = 1;
-               m->lod_qos.lq_same_space = 0;
+               lod->lod_qos.lq_dirty = 1;
+               lod->lod_qos.lq_same_space = 0;
 
                rc = -EAGAIN;
        }
 
 out:
-       up_write(&m->lod_qos.lq_rw_sem);
+       up_write(&lod->lod_qos.lq_rw_sem);
 
 out_nolock:
        if (pool != NULL) {
@@ -1770,7 +1758,7 @@ static int lod_qos_parse_config(const struct lu_env *env,
        if (v1->lmm_stripe_count > 0)
                lo->ldo_stripenr = v1->lmm_stripe_count;
 
-       lo->ldo_def_stripe_offset = v1->lmm_stripe_offset;
+       lo->ldo_stripe_offset = v1->lmm_stripe_offset;
 
        lod_object_set_pool(lo, NULL);
        if (pool_name != NULL) {
@@ -1781,14 +1769,14 @@ static int lod_qos_parse_config(const struct lu_env *env,
                /* coverity[overrun-buffer-val] */
                pool = lod_find_pool(d, pool_name);
                if (pool != NULL) {
-                       if (lo->ldo_def_stripe_offset != LOV_OFFSET_DEFAULT) {
+                       if (lo->ldo_stripe_offset != LOV_OFFSET_DEFAULT) {
                                rc = lod_check_index_in_pool(
-                                              lo->ldo_def_stripe_offset, pool);
+                                               lo->ldo_stripe_offset, pool);
                                if (rc < 0) {
                                        lod_pool_putref(pool);
                                        CERROR("%s: invalid offset, %u\n",
                                               lod2obd(d)->obd_name,
-                                              lo->ldo_def_stripe_offset);
+                                              lo->ldo_stripe_offset);
                                        RETURN(-EINVAL);
                                }
                        }
@@ -1895,7 +1883,7 @@ int lod_qos_prep_create(const struct lu_env *env, struct lod_object *lo,
 
                if (lum != NULL && lum->lmm_magic == LOV_USER_MAGIC_SPECIFIC) {
                        rc = lod_alloc_ost_list(env, lo, stripe, lum, th);
-               } else if (lo->ldo_def_stripe_offset == LOV_OFFSET_DEFAULT) {
+               } else if (lo->ldo_stripe_offset == LOV_OFFSET_DEFAULT) {
                        rc = lod_alloc_qos(env, lo, stripe, flag, th);
                        if (rc == -EAGAIN)
                                rc = lod_alloc_rr(env, lo, stripe, flag, th);