X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Flov%2Flov_qos.c;h=b5ba7224e6b9c7f74dc9ef9e509439e442868bb8;hb=7728c26736f4f49400b3143620e15f5ecac9c588;hp=b292fc367868c8d4c5a74efbbab6d413a4d3f1b4;hpb=dbaf6928620fc8ee9e6f6bf6413764571524be8a;p=fs%2Flustre-release.git

diff --git a/lustre/lov/lov_qos.c b/lustre/lov/lov_qos.c
index b292fc3..b5ba722 100644
--- a/lustre/lov/lov_qos.c
+++ b/lustre/lov/lov_qos.c
@@ -47,6 +47,7 @@
 
 #include <obd_class.h>
 #include <obd_lov.h>
+#include <lustre/lustre_idl.h>
 #include "lov_internal.h"
 
 /* #define QOS_DEBUG 1 */
@@ -121,19 +122,16 @@ out:
         RETURN(rc);
 }
 
-int qos_del_tgt(struct obd_device *obd, __u32 index)
+int qos_del_tgt(struct obd_device *obd, struct lov_tgt_desc *tgt)
 {
         struct lov_obd *lov = &obd->u.lov;
         struct lov_qos_oss *oss;
         int rc = 0;
         ENTRY;
 
-        if (!lov->lov_tgts[index])
-                RETURN(0);
-
         down_write(&lov->lov_qos.lq_rw_sem);
 
-        oss = lov->lov_tgts[index]->ltd_qos.ltq_oss;
+        oss = tgt->ltd_qos.ltq_oss;
         if (!oss)
                 GOTO(out, rc = -ENOENT);
 
@@ -161,6 +159,7 @@ static int qos_calc_ppo(struct obd_device *obd)
         __u64 ba_max, ba_min, temp;
         __u32 num_active;
         int rc, i, prio_wide;
+        time_t now, age;
         ENTRY;
 
         if (!lov->lov_qos.lq_dirty)
@@ -183,6 +182,7 @@ static int qos_calc_ppo(struct obd_device *obd)
 
         ba_min = (__u64)(-1);
         ba_max = 0;
+        now = cfs_time_current_sec();
         /* Calculate OST penalty per object */
         /* (lov ref taken in alloc_qos) */
         for (i = 0; i < lov->desc.ld_tgt_count; i++) {
@@ -205,8 +205,17 @@ static int qos_calc_ppo(struct obd_device *obd)
                 lov->lov_tgts[i]->ltd_qos.ltq_penalty_per_obj =
                         (temp * prio_wide) >> 8;
 
-                if (lov->lov_qos.lq_reset == 0)
+                age = (now - lov->lov_tgts[i]->ltd_qos.ltq_used) >> 3;
+                if (lov->lov_qos.lq_reset || age > 32 * lov->desc.ld_qos_maxage)
                         lov->lov_tgts[i]->ltd_qos.ltq_penalty = 0;
+                else if (age > lov->desc.ld_qos_maxage)
+                        /* Decay the penalty by half for every 8x the update
+                         * interval that the device has been idle.  That gives
+                         * lots of time for the statfs information to be
+                         * updated (which the penalty is only a proxy for),
+                         * and avoids penalizing OSS/OSTs under light load. */
+                        lov->lov_tgts[i]->ltd_qos.ltq_penalty >>=
+                                (age / lov->desc.ld_qos_maxage);
         }
 
         num_active = lov->lov_qos.lq_active_oss_count - 1;
@@ -226,8 +235,17 @@ static int qos_calc_ppo(struct obd_device *obd)
                 temp = oss->lqo_bavail >> 1;
                 do_div(temp, oss->lqo_ost_count * num_active);
                 oss->lqo_penalty_per_obj = (temp * prio_wide) >> 8;
-                if (lov->lov_qos.lq_reset == 0)
+
+                age = (now - oss->lqo_used) >> 3;
+                if (lov->lov_qos.lq_reset || age > 32 * lov->desc.ld_qos_maxage)
                         oss->lqo_penalty = 0;
+                else if (age > lov->desc.ld_qos_maxage)
+                        /* Decay the penalty by half for every 8x the update
+                         * interval that the device has been idle.  That gives
+                         * lots of time for the statfs information to be
+                         * updated (which the penalty is only a proxy for),
+                         * and avoids penalizing OSS/OSTs under light load. */
+                        oss->lqo_penalty >>= (age / lov->desc.ld_qos_maxage);
         }
 
         lov->lov_qos.lq_dirty = 0;
@@ -236,13 +254,10 @@ static int qos_calc_ppo(struct obd_device *obd)
         /* If each ost has almost same free space,
          * do rr allocation for better creation performance */
         lov->lov_qos.lq_same_space = 0;
-        temp = ba_max - ba_min;
-        ba_min = (ba_min * 51) >> 8;     /* 51/256 = .20 */
-        if (temp < ba_min) {
-                /* Difference is less than 20% */
+        if ((ba_max * (256 - lov->lov_qos.lq_threshold_rr)) >> 8 < ba_min) {
                 lov->lov_qos.lq_same_space = 1;
                 /* Reset weights for the next time we enter qos mode */
-                lov->lov_qos.lq_reset = 0;
+                lov->lov_qos.lq_reset = 1;
         }
         rc = 0;
 
@@ -285,6 +300,10 @@ static int qos_used(struct lov_obd *lov, struct ost_pool *osts,
         lov->lov_tgts[index]->ltd_qos.ltq_penalty >>= 1;
         oss->lqo_penalty >>= 1;
 
+        /* mark the OSS and OST as recently used */
+        lov->lov_tgts[index]->ltd_qos.ltq_used =
+                oss->lqo_used = cfs_time_current_sec();
+
         /* Set max penalties for this OST and OSS */
         lov->lov_tgts[index]->ltd_qos.ltq_penalty +=
                 lov->lov_tgts[index]->ltd_qos.ltq_penalty_per_obj *
@@ -490,12 +509,13 @@ int qos_remedy_create(struct lov_request_set *set, struct lov_request *req)
                         continue;
                 /* check if objects has been created on this ost */
                 for (stripe = 0; stripe < lsm->lsm_stripe_count; stripe++) {
+                        /* we try send create to this ost but he is failed */
                         if (stripe == req->rq_stripe)
                                 continue;
+                        /* already have object at this stripe */
                         if (ost_idx == lsm->lsm_oinfo[stripe]->loi_ost_idx)
                                 break;
                 }
-
                 if (stripe >= lsm->lsm_stripe_count) {
                         req->rq_idx = ost_idx;
                         rc = obd_create(lov->lov_tgts[ost_idx]->ltd_exp,
@@ -536,7 +556,7 @@ static int alloc_rr(struct lov_obd *lov, int *idx_arr, int *stripe_cnt,
                 osts = &(lov->lov_packed);
                 lqr = &(lov->lov_qos.lq_rr);
         } else {
-                read_lock(&pool_tgt_rwlock(pool));
+                down_read(&pool_tgt_rw_sem(pool));
                 osts = &(pool->pool_obds);
                 lqr = &(pool->pool_rr);
         }
@@ -613,8 +633,12 @@ repeat_find:
 
         *stripe_cnt = idx_pos - idx_arr;
 out:
-        if (pool != NULL)
-                read_unlock(&pool_tgt_rwlock(pool));
+        if (pool != NULL) {
+                up_read(&pool_tgt_rw_sem(pool));
+                /* put back ref got by lov_find_pool() */
+                lov_pool_putref(pool);
+        }
+
         RETURN(rc);
 }
 
@@ -633,7 +657,7 @@ static int alloc_specific(struct lov_obd *lov, struct lov_stripe_md *lsm,
         if (pool == NULL) {
                 osts = &(lov->lov_packed);
         } else {
-                read_lock(&pool_tgt_rwlock(pool));
+                down_read(&pool_tgt_rw_sem(pool));
                 osts = &(pool->pool_obds);
         }
 
@@ -669,9 +693,13 @@ repeat_find:
                 if (OBD_FAIL_CHECK(OBD_FAIL_MDS_OSC_PRECREATE) && ost_idx == 0)
                         continue;
 
-                /* Drop slow OSCs if we can, but not for requested start idx */
+                /* Drop slow OSCs if we can, but not for requested start idx.
+                 *
+                 * This means "if OSC is slow and it is not the requested
+                 * start OST, then it can be skipped, otherwise skip it only
+                 * if it is inactive/recovering/out-of-space." */
                 if ((obd_precreate(lov->lov_tgts[ost_idx]->ltd_exp) > speed) &&
-                    (i != 0 || speed < 2))
+                    (i != 0 || speed >= 2))
                         continue;
 
                 *idx_pos = ost_idx;
@@ -697,8 +725,12 @@ repeat_find:
                lsm->lsm_stripe_count);
         rc = -EFBIG;
 out:
-        if (pool != NULL)
-                read_unlock(&pool_tgt_rwlock(pool));
+        if (pool != NULL) {
+                up_read(&pool_tgt_rw_sem(pool));
+                /* put back ref got by lov_find_pool() */
+                lov_pool_putref(pool);
+        }
+
         RETURN(rc);
 }
 
@@ -728,12 +760,17 @@ static int alloc_qos(struct obd_export *exp, int *idx_arr, int *stripe_cnt,
                 osts = &(lov->lov_packed);
                 lqr = &(lov->lov_qos.lq_rr);
         } else {
-                read_lock(&pool_tgt_rwlock(pool));
+                down_read(&pool_tgt_rw_sem(pool));
                 osts = &(pool->pool_obds);
                 lqr = &(pool->pool_rr);
         }
 
-        lov_getref(exp->exp_obd);
+        obd_getref(exp->exp_obd);
+
+        /* wait for fresh statfs info if needed, the rpcs are sent in
+         * lov_create() */
+        qos_statfs_update(exp->exp_obd,
+                          cfs_time_shift_64(-2 * lov->desc.ld_qos_maxage), 1);
 
         /* Detect -EAGAIN early, before expensive lock is taken. */
         if (!lov->lov_qos.lq_dirty && lov->lov_qos.lq_same_space)
@@ -888,13 +925,16 @@ out:
         up_write(&lov->lov_qos.lq_rw_sem);
 
 out_nolock:
-        if (pool != NULL)
-                read_unlock(&pool_tgt_rwlock(pool));
+        if (pool != NULL) {
+                up_read(&pool_tgt_rw_sem(pool));
+                /* put back ref got by lov_find_pool() */
+                lov_pool_putref(pool);
+        }
 
         if (rc == -EAGAIN)
                 rc = alloc_rr(lov, idx_arr, stripe_cnt, poolname, flags);
 
-        lov_putref(exp->exp_obd);
+        obd_putref(exp->exp_obd);
         RETURN(rc);
 }
 
@@ -1032,6 +1072,7 @@ int qos_prep_create(struct obd_export *exp, struct lov_request_set *set)
                 req->rq_stripe = i;
                 /* create data objects with "parent" OA */
                 memcpy(req->rq_oi.oi_oa, src_oa, sizeof(*req->rq_oi.oi_oa));
+                req->rq_oi.oi_cb_up = cb_create_update;
 
                 /* XXX When we start creating objects on demand, we need to
                  *     make sure that we always create the object on the
@@ -1049,6 +1090,10 @@ int qos_prep_create(struct obd_export *exp, struct lov_request_set *set)
 
         if (stripes < lsm->lsm_stripe_count)
                 qos_shrink_lsm(set);
+        if (OBD_FAIL_CHECK(OBD_FAIL_MDS_LOV_PREP_CREATE)) {
+                qos_shrink_lsm(set);
+                rc = -EIO;
+        }
 
         if (oti && (src_oa->o_valid & OBD_MD_FLCOOKIE)) {
                 oti_alloc_cookies(oti, set->set_count);
@@ -1070,3 +1115,110 @@ void qos_update(struct lov_obd *lov)
         ENTRY;
         lov->lov_qos.lq_dirty = 1;
 }
+
+void qos_statfs_done(struct lov_obd *lov)
+{
+        LASSERT(lov->lov_qos.lq_statfs_in_progress);
+        down_write(&lov->lov_qos.lq_rw_sem);
+        lov->lov_qos.lq_statfs_in_progress = 0;
+        /* wake up any threads waiting for the statfs rpcs to complete */
+        cfs_waitq_signal(&lov->lov_qos.lq_statfs_waitq);
+        up_write(&lov->lov_qos.lq_rw_sem);
+}
+
+static int qos_statfs_ready(struct obd_device *obd, __u64 max_age)
+{
+        struct lov_obd         *lov = &obd->u.lov;
+        int rc;
+        ENTRY;
+        down_read(&lov->lov_qos.lq_rw_sem);
+        rc = lov->lov_qos.lq_statfs_in_progress == 0 ||
+             cfs_time_beforeq_64(max_age, obd->obd_osfs_age);
+        up_read(&lov->lov_qos.lq_rw_sem);
+        RETURN(rc);
+}
+
+/*
+ * Update statfs data if the current osfs age is older than max_age.
+ * If wait is not set, it means that we are called from lov_create()
+ * and we should just issue the rpcs without waiting for them to complete.
+ * If wait is set, we are called from alloc_qos() and we just have
+ * to wait for the request set to complete.
+ */
+void qos_statfs_update(struct obd_device *obd, __u64 max_age, int wait)
+{
+        struct lov_obd         *lov = &obd->u.lov;
+        struct obd_info        *oinfo;
+        int                     rc = 0;
+        struct ptlrpc_request_set *set = NULL;
+        ENTRY;
+
+        if (cfs_time_beforeq_64(max_age, obd->obd_osfs_age))
+                /* statfs data are quite recent, don't need to refresh it */
+                RETURN_EXIT;
+
+        if (!wait && lov->lov_qos.lq_statfs_in_progress)
+                /* statfs already in progress */
+                RETURN_EXIT;
+
+        down_write(&lov->lov_qos.lq_rw_sem);
+        if (lov->lov_qos.lq_statfs_in_progress) {
+                up_write(&lov->lov_qos.lq_rw_sem);
+                GOTO(out, rc = 0);
+        }
+        /* no statfs in flight, send rpcs */
+        lov->lov_qos.lq_statfs_in_progress = 1;
+        up_write(&lov->lov_qos.lq_rw_sem);
+
+        if (wait)
+                CDEBUG(D_QOS, "%s: did not manage to get fresh statfs data "
+                       "in a timely manner (osfs age "LPU64", max age "LPU64")"
+                       ", sending new statfs rpcs\n",
+                       obd_uuid2str(&lov->desc.ld_uuid), obd->obd_osfs_age,
+                       max_age);
+
+        /* need to send statfs rpcs */
+        CDEBUG(D_QOS, "sending new statfs requests\n");
+        memset(lov->lov_qos.lq_statfs_data, 0,
+               sizeof(*lov->lov_qos.lq_statfs_data));
+        oinfo = &lov->lov_qos.lq_statfs_data->lsd_oi;
+        oinfo->oi_osfs = &lov->lov_qos.lq_statfs_data->lsd_statfs;
+        oinfo->oi_flags = OBD_STATFS_NODELAY;
+        set = ptlrpc_prep_set();
+        if (!set)
+                GOTO(out_failed, rc = -ENOMEM);
+
+        rc = obd_statfs_async(obd, oinfo, max_age, set);
+        if (rc || list_empty(&set->set_requests)) {
+                if (rc)
+                        CWARN("statfs failed with %d\n", rc);
+                GOTO(out_failed, rc);
+        }
+        /* send requests via ptlrpcd */
+        oinfo->oi_flags |= OBD_STATFS_PTLRPCD;
+        ptlrpcd_add_rqset(set);
+        GOTO(out, rc);
+
+out_failed:
+        down_write(&lov->lov_qos.lq_rw_sem);
+        lov->lov_qos.lq_statfs_in_progress = 0;
+        /* wake up any threads waiting for the statfs rpcs to complete */
+        cfs_waitq_signal(&lov->lov_qos.lq_statfs_waitq);
+        up_write(&lov->lov_qos.lq_rw_sem);
+        wait = 0;
+out:
+        if (set)
+                ptlrpc_set_destroy(set);
+        if (wait) {
+                struct l_wait_info lwi = { 0 };
+                CDEBUG(D_QOS, "waiting for statfs requests to complete\n");
+                l_wait_event(lov->lov_qos.lq_statfs_waitq,
+                             qos_statfs_ready(obd, max_age), &lwi);
+                if (cfs_time_before_64(obd->obd_osfs_age, max_age))
+                        CDEBUG(D_QOS, "%s: still no fresh statfs data after "
+                                      "waiting (osfs age "LPU64", max age "
+                                      LPU64")\n",
+                                      obd_uuid2str(&lov->desc.ld_uuid),
+                                      obd->obd_osfs_age, max_age);
+        }
+}