Whamcloud - gitweb
OBD_SLAB_ALLOC_PTR_SAFE() is a new OBD_ macro for slab allocation that uses CFS_ALLOC...
[fs/lustre-release.git] / lustre / lov / lov_qos.c
index dd8d526..8913d90 100644 (file)
@@ -121,19 +121,16 @@ out:
         RETURN(rc);
 }
 
-int qos_del_tgt(struct obd_device *obd, __u32 index)
+int qos_del_tgt(struct obd_device *obd, struct lov_tgt_desc *tgt)
 {
         struct lov_obd *lov = &obd->u.lov;
         struct lov_qos_oss *oss;
         int rc = 0;
         ENTRY;
 
-        if (!lov->lov_tgts[index])
-                RETURN(0);
-
         down_write(&lov->lov_qos.lq_rw_sem);
 
-        oss = lov->lov_tgts[index]->ltd_qos.ltq_oss;
+        oss = tgt->ltd_qos.ltq_oss;
         if (!oss)
                 GOTO(out, rc = -ENOENT);
 
@@ -161,6 +158,7 @@ static int qos_calc_ppo(struct obd_device *obd)
         __u64 ba_max, ba_min, temp;
         __u32 num_active;
         int rc, i, prio_wide;
+        time_t now, age;
         ENTRY;
 
         if (!lov->lov_qos.lq_dirty)
@@ -183,6 +181,7 @@ static int qos_calc_ppo(struct obd_device *obd)
 
         ba_min = (__u64)(-1);
         ba_max = 0;
+        now = cfs_time_current_sec();
         /* Calculate OST penalty per object */
         /* (lov ref taken in alloc_qos) */
         for (i = 0; i < lov->desc.ld_tgt_count; i++) {
@@ -205,8 +204,17 @@ static int qos_calc_ppo(struct obd_device *obd)
                 lov->lov_tgts[i]->ltd_qos.ltq_penalty_per_obj =
                         (temp * prio_wide) >> 8;
 
-                if (lov->lov_qos.lq_reset == 0)
+                age = (now - lov->lov_tgts[i]->ltd_qos.ltq_used) >> 3;
+                if (lov->lov_qos.lq_reset || age > 32 * lov->desc.ld_qos_maxage)
                         lov->lov_tgts[i]->ltd_qos.ltq_penalty = 0;
+                else if (age > lov->desc.ld_qos_maxage)
+                        /* Decay the penalty by half for every 8x the update
+                         * interval that the device has been idle.  That gives
+                         * lots of time for the statfs information to be
+                         * updated (which the penalty is only a proxy for),
+                         * and avoids penalizing OSS/OSTs under light load. */
+                        lov->lov_tgts[i]->ltd_qos.ltq_penalty >>=
+                                (age / lov->desc.ld_qos_maxage);
         }
 
         num_active = lov->lov_qos.lq_active_oss_count - 1;
@@ -226,8 +234,17 @@ static int qos_calc_ppo(struct obd_device *obd)
                 temp = oss->lqo_bavail >> 1;
                 do_div(temp, oss->lqo_ost_count * num_active);
                 oss->lqo_penalty_per_obj = (temp * prio_wide) >> 8;
-                if (lov->lov_qos.lq_reset == 0)
+
+                age = (now - oss->lqo_used) >> 3;
+                if (lov->lov_qos.lq_reset || age > 32 * lov->desc.ld_qos_maxage)
                         oss->lqo_penalty = 0;
+                else if (age > lov->desc.ld_qos_maxage)
+                        /* Decay the penalty by half for every 8x the update
+                         * interval that the device has been idle.  That gives
+                         * lots of time for the statfs information to be
+                         * updated (which the penalty is only a proxy for),
+                         * and avoids penalizing OSS/OSTs under light load. */
+                        oss->lqo_penalty >>= (age / lov->desc.ld_qos_maxage);
         }
 
         lov->lov_qos.lq_dirty = 0;
@@ -236,13 +253,10 @@ static int qos_calc_ppo(struct obd_device *obd)
         /* If each ost has almost same free space,
          * do rr allocation for better creation performance */
         lov->lov_qos.lq_same_space = 0;
-        temp = ba_max - ba_min;
-        ba_min = (ba_min * 51) >> 8;     /* 51/256 = .20 */
-        if (temp < ba_min) {
-                /* Difference is less than 20% */
+        if ((ba_max * (256 - lov->lov_qos.lq_threshold_rr)) >> 8 < ba_min) {
                 lov->lov_qos.lq_same_space = 1;
                 /* Reset weights for the next time we enter qos mode */
-                lov->lov_qos.lq_reset = 0;
+                lov->lov_qos.lq_reset = 1;
         }
         rc = 0;
 
@@ -285,6 +299,10 @@ static int qos_used(struct lov_obd *lov, struct ost_pool *osts,
         lov->lov_tgts[index]->ltd_qos.ltq_penalty >>= 1;
         oss->lqo_penalty >>= 1;
 
+        /* mark the OSS and OST as recently used */
+        lov->lov_tgts[index]->ltd_qos.ltq_used =
+                oss->lqo_used = cfs_time_current_sec();
+
         /* Set max penalties for this OST and OSS */
         lov->lov_tgts[index]->ltd_qos.ltq_penalty +=
                 lov->lov_tgts[index]->ltd_qos.ltq_penalty_per_obj *
@@ -536,7 +554,7 @@ static int alloc_rr(struct lov_obd *lov, int *idx_arr, int *stripe_cnt,
                 osts = &(lov->lov_packed);
                 lqr = &(lov->lov_qos.lq_rr);
         } else {
-                read_lock(&pool_tgt_rwlock(pool));
+                down_read(&pool_tgt_rw_sem(pool));
                 osts = &(pool->pool_obds);
                 lqr = &(pool->pool_rr);
         }
@@ -613,8 +631,12 @@ repeat_find:
 
         *stripe_cnt = idx_pos - idx_arr;
 out:
-        if (pool != NULL)
-                read_unlock(&pool_tgt_rwlock(pool));
+        if (pool != NULL) {
+                up_read(&pool_tgt_rw_sem(pool));
+                /* put back ref got by lov_find_pool() */
+                lov_pool_putref(pool);
+        }
+
         RETURN(rc);
 }
 
@@ -633,7 +655,7 @@ static int alloc_specific(struct lov_obd *lov, struct lov_stripe_md *lsm,
         if (pool == NULL) {
                 osts = &(lov->lov_packed);
         } else {
-                read_lock(&pool_tgt_rwlock(pool));
+                down_read(&pool_tgt_rw_sem(pool));
                 osts = &(pool->pool_obds);
         }
 
@@ -669,9 +691,13 @@ repeat_find:
                 if (OBD_FAIL_CHECK(OBD_FAIL_MDS_OSC_PRECREATE) && ost_idx == 0)
                         continue;
 
-                /* Drop slow OSCs if we can, but not for requested start idx */
+                /* Drop slow OSCs if we can, but not for requested start idx.
+                 *
+                 * This means "if OSC is slow and it is not the requested
+                 * start OST, then it can be skipped, otherwise skip it only
+                 * if it is inactive/recovering/out-of-space." */
                 if ((obd_precreate(lov->lov_tgts[ost_idx]->ltd_exp) > speed) &&
-                    (i != 0 || speed < 2))
+                    (i != 0 || speed >= 2))
                         continue;
 
                 *idx_pos = ost_idx;
@@ -697,8 +723,12 @@ repeat_find:
                lsm->lsm_stripe_count);
         rc = -EFBIG;
 out:
-        if (pool != NULL)
-                read_unlock(&pool_tgt_rwlock(pool));
+        if (pool != NULL) {
+                up_read(&pool_tgt_rw_sem(pool));
+                /* put back ref got by lov_find_pool() */
+                lov_pool_putref(pool);
+        }
+
         RETURN(rc);
 }
 
@@ -728,7 +758,7 @@ static int alloc_qos(struct obd_export *exp, int *idx_arr, int *stripe_cnt,
                 osts = &(lov->lov_packed);
                 lqr = &(lov->lov_qos.lq_rr);
         } else {
-                read_lock(&pool_tgt_rwlock(pool));
+                down_read(&pool_tgt_rw_sem(pool));
                 osts = &(pool->pool_obds);
                 lqr = &(pool->pool_rr);
         }
@@ -888,8 +918,11 @@ out:
         up_write(&lov->lov_qos.lq_rw_sem);
 
 out_nolock:
-        if (pool != NULL)
-                read_unlock(&pool_tgt_rwlock(pool));
+        if (pool != NULL) {
+                up_read(&pool_tgt_rw_sem(pool));
+                /* put back ref got by lov_find_pool() */
+                lov_pool_putref(pool);
+        }
 
         if (rc == -EAGAIN)
                 rc = alloc_rr(lov, idx_arr, stripe_cnt, poolname, flags);
@@ -965,7 +998,6 @@ int qos_prep_create(struct obd_export *exp, struct lov_request_set *set)
                         /* Find a small number of stripes we can use
                            (up to # of active osts). */
                         stripes = 1;
-                        lov_getref(exp->exp_obd);
                         for (i = 0; i < lov->desc.ld_tgt_count; i++) {
                                 if (!lov->lov_tgts[i] ||
                                     !lov->lov_tgts[i]->ltd_active)
@@ -975,7 +1007,6 @@ int qos_prep_create(struct obd_export *exp, struct lov_request_set *set)
                                         break;
                                 stripes++;
                         }
-                        lov_putref(exp->exp_obd);
 
                         if (stripes < stripes_def)
                                 stripes = stripes_def;