RETURN(rc);
}
-int qos_del_tgt(struct obd_device *obd, __u32 index)
+int qos_del_tgt(struct obd_device *obd, struct lov_tgt_desc *tgt)
{
struct lov_obd *lov = &obd->u.lov;
struct lov_qos_oss *oss;
int rc = 0;
ENTRY;
- if (!lov->lov_tgts[index])
- RETURN(0);
-
down_write(&lov->lov_qos.lq_rw_sem);
- oss = lov->lov_tgts[index]->ltd_qos.ltq_oss;
+ oss = tgt->ltd_qos.ltq_oss;
if (!oss)
GOTO(out, rc = -ENOENT);
__u64 ba_max, ba_min, temp;
__u32 num_active;
int rc, i, prio_wide;
+ time_t now, age;
ENTRY;
if (!lov->lov_qos.lq_dirty)
ba_min = (__u64)(-1);
ba_max = 0;
+ now = cfs_time_current_sec();
/* Calculate OST penalty per object */
/* (lov ref taken in alloc_qos) */
for (i = 0; i < lov->desc.ld_tgt_count; i++) {
lov->lov_tgts[i]->ltd_qos.ltq_penalty_per_obj =
(temp * prio_wide) >> 8;
- if (lov->lov_qos.lq_reset == 0)
+ age = (now - lov->lov_tgts[i]->ltd_qos.ltq_used) >> 3;
+ if (lov->lov_qos.lq_reset || age > 32 * lov->desc.ld_qos_maxage)
lov->lov_tgts[i]->ltd_qos.ltq_penalty = 0;
+ else if (age > lov->desc.ld_qos_maxage)
+ /* Decay the penalty by half for every 8x the update
+ * interval that the device has been idle. That gives
+ * lots of time for the statfs information to be
+ * updated (which the penalty is only a proxy for),
+ * and avoids penalizing OSS/OSTs under light load. */
+ lov->lov_tgts[i]->ltd_qos.ltq_penalty >>=
+ (age / lov->desc.ld_qos_maxage);
}
num_active = lov->lov_qos.lq_active_oss_count - 1;
temp = oss->lqo_bavail >> 1;
do_div(temp, oss->lqo_ost_count * num_active);
oss->lqo_penalty_per_obj = (temp * prio_wide) >> 8;
- if (lov->lov_qos.lq_reset == 0)
+
+ age = (now - oss->lqo_used) >> 3;
+ if (lov->lov_qos.lq_reset || age > 32 * lov->desc.ld_qos_maxage)
oss->lqo_penalty = 0;
+ else if (age > lov->desc.ld_qos_maxage)
+ /* Decay the penalty by half for every 8x the update
+ * interval that the device has been idle. That gives
+ * lots of time for the statfs information to be
+ * updated (which the penalty is only a proxy for),
+ * and avoids penalizing OSS/OSTs under light load. */
+ oss->lqo_penalty >>= (age / lov->desc.ld_qos_maxage);
}
lov->lov_qos.lq_dirty = 0;
/* If each ost has almost same free space,
* do rr allocation for better creation performance */
lov->lov_qos.lq_same_space = 0;
- temp = ba_max - ba_min;
- ba_min = (ba_min * 51) >> 8; /* 51/256 = .20 */
- if (temp < ba_min) {
- /* Difference is less than 20% */
+ if ((ba_max * (256 - lov->lov_qos.lq_threshold_rr)) >> 8 < ba_min) {
lov->lov_qos.lq_same_space = 1;
/* Reset weights for the next time we enter qos mode */
- lov->lov_qos.lq_reset = 0;
+ lov->lov_qos.lq_reset = 1;
}
rc = 0;
lov->lov_tgts[index]->ltd_qos.ltq_penalty >>= 1;
oss->lqo_penalty >>= 1;
+ /* mark the OSS and OST as recently used */
+ lov->lov_tgts[index]->ltd_qos.ltq_used =
+ oss->lqo_used = cfs_time_current_sec();
+
/* Set max penalties for this OST and OSS */
lov->lov_tgts[index]->ltd_qos.ltq_penalty +=
lov->lov_tgts[index]->ltd_qos.ltq_penalty_per_obj *
osts = &(lov->lov_packed);
lqr = &(lov->lov_qos.lq_rr);
} else {
- read_lock(&pool_tgt_rwlock(pool));
+ down_read(&pool_tgt_rw_sem(pool));
osts = &(pool->pool_obds);
lqr = &(pool->pool_rr);
}
*stripe_cnt = idx_pos - idx_arr;
out:
- if (pool != NULL)
- read_unlock(&pool_tgt_rwlock(pool));
+ if (pool != NULL) {
+ up_read(&pool_tgt_rw_sem(pool));
+ /* drop the reference taken by lov_find_pool() */
+ lov_pool_putref(pool);
+ }
+
RETURN(rc);
}
if (pool == NULL) {
osts = &(lov->lov_packed);
} else {
- read_lock(&pool_tgt_rwlock(pool));
+ down_read(&pool_tgt_rw_sem(pool));
osts = &(pool->pool_obds);
}
if (OBD_FAIL_CHECK(OBD_FAIL_MDS_OSC_PRECREATE) && ost_idx == 0)
continue;
- /* Drop slow OSCs if we can, but not for requested start idx */
+ /* Drop slow OSCs if we can, but not for requested start idx.
+ *
+ * This means "if OSC is slow and it is not the requested
+ * start OST, then it can be skipped, otherwise skip it only
+ * if it is inactive/recovering/out-of-space." */
if ((obd_precreate(lov->lov_tgts[ost_idx]->ltd_exp) > speed) &&
- (i != 0 || speed < 2))
+ (i != 0 || speed >= 2))
continue;
*idx_pos = ost_idx;
lsm->lsm_stripe_count);
rc = -EFBIG;
out:
- if (pool != NULL)
- read_unlock(&pool_tgt_rwlock(pool));
+ if (pool != NULL) {
+ up_read(&pool_tgt_rw_sem(pool));
+ /* drop the reference taken by lov_find_pool() */
+ lov_pool_putref(pool);
+ }
+
RETURN(rc);
}
osts = &(lov->lov_packed);
lqr = &(lov->lov_qos.lq_rr);
} else {
- read_lock(&pool_tgt_rwlock(pool));
+ down_read(&pool_tgt_rw_sem(pool));
osts = &(pool->pool_obds);
lqr = &(pool->pool_rr);
}
up_write(&lov->lov_qos.lq_rw_sem);
out_nolock:
- if (pool != NULL)
- read_unlock(&pool_tgt_rwlock(pool));
+ if (pool != NULL) {
+ up_read(&pool_tgt_rw_sem(pool));
+ /* drop the reference taken by lov_find_pool() */
+ lov_pool_putref(pool);
+ }
if (rc == -EAGAIN)
rc = alloc_rr(lov, idx_arr, stripe_cnt, poolname, flags);
/* Find a small number of stripes we can use
(up to # of active osts). */
stripes = 1;
- lov_getref(exp->exp_obd);
for (i = 0; i < lov->desc.ld_tgt_count; i++) {
if (!lov->lov_tgts[i] ||
!lov->lov_tgts[i]->ltd_active)
break;
stripes++;
}
- lov_putref(exp->exp_obd);
if (stripes < stripes_def)
stripes = stripes_def;