* Copyright 2009 Sun Microsystems, Inc. All rights reserved
* Use is subject to license terms.
*
- * Copyright (c) 2011, 2012, Whamcloud, Inc.
+ * Copyright (c) 2012, Intel Corporation.
*/
/*
* This file is part of Lustre, http://www.lustre.org/
#define TGT_BAVAIL(i) (OST_TGT(lod,i)->ltd_statfs.os_bavail * \
OST_TGT(lod,i)->ltd_statfs.os_bsize)
-int qos_add_tgt(struct lod_device *lod, struct lod_ost_desc *ost_desc)
+int qos_add_tgt(struct lod_device *lod, struct lod_tgt_desc *ost_desc)
{
struct lov_qos_oss *oss = NULL, *temposs;
struct obd_export *exp = ost_desc->ltd_exp;
cfs_list_t *list;
ENTRY;
- cfs_down_write(&lod->lod_qos.lq_rw_sem);
+ down_write(&lod->lod_qos.lq_rw_sem);
/*
* a bit hacky approach to learn the NID of the corresponding connection,
* but there is no official API to access information like this
lod->lod_qos.lq_rr.lqr_dirty = 1;
out:
- cfs_up_write(&lod->lod_qos.lq_rw_sem);
+ up_write(&lod->lod_qos.lq_rw_sem);
RETURN(rc);
}
-int qos_del_tgt(struct lod_device *lod, struct lod_ost_desc *ost_desc)
+int qos_del_tgt(struct lod_device *lod, struct lod_tgt_desc *ost_desc)
{
struct lov_qos_oss *oss;
int rc = 0;
ENTRY;
- cfs_down_write(&lod->lod_qos.lq_rw_sem);
+ down_write(&lod->lod_qos.lq_rw_sem);
oss = ost_desc->ltd_qos.ltq_oss;
if (!oss)
GOTO(out, rc = -ENOENT);
lod->lod_qos.lq_dirty = 1;
lod->lod_qos.lq_rr.lqr_dirty = 1;
out:
- cfs_up_write(&lod->lod_qos.lq_rw_sem);
+ up_write(&lod->lod_qos.lq_rw_sem);
RETURN(rc);
}
static int lod_statfs_and_check(const struct lu_env *env, struct lod_device *d,
int index, struct obd_statfs *sfs)
{
- struct lod_ost_desc *ost;
+ struct lod_tgt_desc *ost;
int rc;
LASSERT(d);
LASSERT(ost);
rc = dt_statfs(env, ost->ltd_ost, sfs);
- if (rc)
- return rc;
+ if (rc && rc != -ENOTCONN)
+ CERROR("%s: statfs: rc = %d\n", lod2obd(d)->obd_name, rc);
/* check whether device has changed state (active, inactive) */
- if (unlikely(sfs->os_blocks == 0 && ost->ltd_active)) {
+ if (rc != 0 && ost->ltd_active) {
/* turned inactive? */
- cfs_spin_lock(&d->lod_desc_lock);
- if (sfs->os_blocks == 0 && ost->ltd_active) {
+ spin_lock(&d->lod_desc_lock);
+ if (ost->ltd_active) {
ost->ltd_active = 0;
LASSERT(d->lod_desc.ld_active_tgt_count > 0);
d->lod_desc.ld_active_tgt_count--;
CDEBUG(D_CONFIG, "%s: turns inactive\n",
ost->ltd_exp->exp_obd->obd_name);
}
- cfs_spin_unlock(&d->lod_desc_lock);
- } else if (unlikely(sfs->os_blocks && ost->ltd_active == 0)) {
+ spin_unlock(&d->lod_desc_lock);
+ } else if (rc == 0 && ost->ltd_active == 0) {
/* turned active? */
LASSERT(d->lod_desc.ld_active_tgt_count < d->lod_ostnr);
- cfs_spin_lock(&d->lod_desc_lock);
- if (sfs->os_blocks && ost->ltd_active == 0) {
+ spin_lock(&d->lod_desc_lock);
+ if (ost->ltd_active == 0) {
ost->ltd_active = 1;
d->lod_desc.ld_active_tgt_count++;
d->lod_qos.lq_dirty = 1;
CDEBUG(D_CONFIG, "%s: turns active\n",
ost->ltd_exp->exp_obd->obd_name);
}
- cfs_spin_unlock(&d->lod_desc_lock);
+ spin_unlock(&d->lod_desc_lock);
}
return rc;
}
-/*
- * Update statfs data if the current osfs age is older than max_age.
- * If wait is not set, it means that we are called from lov_create()
- * and we should just issue the rpcs without waiting for them to complete.
- * If wait is set, we are called from alloc_qos() and we just have
- * to wait for the request set to complete.
- */
static void lod_qos_statfs_update(const struct lu_env *env,
struct lod_device *lod)
{
__u64 max_age, avail;
ENTRY;
- max_age = cfs_time_shift_64(-2*lod->lod_desc.ld_qos_maxage);
+ max_age = cfs_time_shift_64(-2 * lod->lod_desc.ld_qos_maxage);
if (cfs_time_beforeq_64(max_age, obd->obd_osfs_age))
/* statfs data are quite recent, don't need to refresh it */
RETURN_EXIT;
- cfs_down_write(&lod->lod_qos.lq_rw_sem);
+ down_write(&lod->lod_qos.lq_rw_sem);
if (cfs_time_beforeq_64(max_age, obd->obd_osfs_age))
GOTO(out, rc = 0);
avail = OST_TGT(lod,idx)->ltd_statfs.os_bavail;
rc = lod_statfs_and_check(env, lod, idx,
&OST_TGT(lod,idx)->ltd_statfs);
- if (rc) {
- /* XXX: disable this OST till next refresh? */
- CERROR("can't refresh statfs: %d\n", rc);
+ if (rc)
break;
- }
if (OST_TGT(lod,idx)->ltd_statfs.os_bavail != avail)
/* recalculate weights */
lod->lod_qos.lq_dirty = 1;
obd->obd_osfs_age = cfs_time_current_64();
out:
- cfs_up_write(&lod->lod_qos.lq_rw_sem);
+ up_write(&lod->lod_qos.lq_rw_sem);
}
/* Recalculate per-object penalties for OSSs and OSTs,
oss->lqo_bavail = 0;
lod->lod_qos.lq_active_oss_count = 0;
- /* How badly user wants to select osts "widely" (not recently chosen
- and not on recent oss's). As opposed to "freely" (free space
- avail.) 0-256. */
+ /*
+ * How badly user wants to select OSTs "widely" (not recently chosen
+ * and not on recent OSS's). As opposed to "freely" (free space
+ * avail.) 0-256
+ */
prio_wide = 256 - lod->lod_qos.lq_prio_free;
ba_min = (__u64)(-1);
age = (now - OST_TGT(lod,i)->ltd_qos.ltq_used) >> 3;
if (lod->lod_qos.lq_reset ||
- age > 32 * lod->lod_desc.ld_qos_maxage)
+ age > 32 * lod->lod_desc.ld_qos_maxage)
OST_TGT(lod,i)->ltd_qos.ltq_penalty = 0;
else if (age > lod->lod_desc.ld_qos_maxage)
/* Decay the penalty by half for every 8x the update
static int lod_qos_used(struct lod_device *lod, struct ost_pool *osts,
__u32 index, __u64 *total_wt)
{
- struct lod_ost_desc *ost;
+ struct lod_tgt_desc *ost;
struct lov_qos_oss *oss;
int j;
ENTRY;
struct lov_qos_rr *lqr)
{
struct lov_qos_oss *oss;
- struct lod_ost_desc *ost;
+ struct lod_tgt_desc *ost;
unsigned placed, real_count;
int i, rc;
ENTRY;
}
/* Do actual allocation. */
- cfs_down_write(&lod->lod_qos.lq_rw_sem);
+ down_write(&lod->lod_qos.lq_rw_sem);
/*
* Check again. While we were sleeping on @lq_rw_sem something could
*/
if (!lqr->lqr_dirty) {
LASSERT(lqr->lqr_pool.op_size);
- cfs_up_write(&lod->lod_qos.lq_rw_sem);
+ up_write(&lod->lod_qos.lq_rw_sem);
RETURN(0);
}
lqr->lqr_pool.op_count = real_count;
rc = lod_ost_pool_extend(&lqr->lqr_pool, real_count);
if (rc) {
- cfs_up_write(&lod->lod_qos.lq_rw_sem);
+ up_write(&lod->lod_qos.lq_rw_sem);
RETURN(rc);
}
for (i = 0; i < lqr->lqr_pool.op_count; i++)
}
lqr->lqr_dirty = 0;
- cfs_up_write(&lod->lod_qos.lq_rw_sem);
+ up_write(&lod->lod_qos.lq_rw_sem);
if (placed != real_count) {
/* This should never happen */
int ost_idx,
struct thandle *th)
{
- struct lod_ost_desc *ost;
+ struct lod_tgt_desc *ost;
struct lu_object *o, *n;
struct lu_device *nd;
struct dt_object *dt;
/* the minimum of 0.1% used blocks and 1GB bytes. */
used = min_t(__u64, (msfs->os_blocks - msfs->os_bfree) >> 10,
- 1 << (31 - cfs_ffs(bs)));
+ 1 << (31 - ffs(bs)));
return (msfs->os_bavail < used);
}
return 0;
}
-/* Allocate objects on osts with round-robin algorithm */
+/* Allocate objects on OSTs with round-robin algorithm */
static int lod_alloc_rr(const struct lu_env *env, struct lod_object *lo,
int flags, struct thandle *th)
{
pool = lod_find_pool(m, lo->ldo_pool);
if (pool != NULL) {
- cfs_down_read(&pool_tgt_rw_sem(pool));
+ down_read(&pool_tgt_rw_sem(pool));
osts = &(pool->pool_obds);
lqr = &(pool->pool_rr);
} else {
if (rc)
GOTO(out, rc);
+ rc = lod_qos_ost_in_use_clear(env, lo->ldo_stripenr);
+ if (rc)
+ GOTO(out, rc);
+
if (--lqr->lqr_start_count <= 0) {
lqr->lqr_start_idx = cfs_rand() % osts->op_count;
lqr->lqr_start_count =
if (stripe_cnt > 1 && (osts->op_count % stripe_cnt) != 1)
++lqr->lqr_offset_idx;
}
- cfs_down_read(&m->lod_qos.lq_rw_sem);
+ down_read(&m->lod_qos.lq_rw_sem);
ost_start_idx_temp = lqr->lqr_start_idx;
repeat_find:
lqr->lqr_offset_idx, osts->op_count, osts->op_count,
array_idx);
- for (i = 0; i < osts->op_count;
- i++, array_idx = (array_idx + 1) % osts->op_count) {
+ for (i = 0; i < osts->op_count && stripe_idx < lo->ldo_stripenr;
+ i++, array_idx = (array_idx + 1) % osts->op_count) {
++lqr->lqr_start_idx;
ost_idx = lqr->lqr_pool.op_array[array_idx];
stripe_idx, array_idx, ost_idx);
if ((ost_idx == LOV_QOS_EMPTY) ||
- !cfs_bitmap_check(m->lod_ost_bitmap, ost_idx))
+ !cfs_bitmap_check(m->lod_ost_bitmap, ost_idx))
continue;
/* Fail Check before osc_precreate() is called
rc = lod_statfs_and_check(env, m, ost_idx, sfs);
if (rc) {
/* this OSP doesn't feel well */
- CERROR("can't statfs #%u: %d\n", ost_idx, rc);
- continue;
- }
-
- /*
- * skip empty devices - usually it means inactive device
- */
- if (sfs->os_blocks == 0) {
- QOS_DEBUG("#%d: inactive\n", ost_idx);
continue;
}
* We expect number of precreated objects in f_ffree at
* the first iteration, skip OSPs with no objects ready
*/
- if (sfs->os_ffree == 0 && speed == 0) {
+ if (sfs->os_fprecreated == 0 && speed == 0) {
QOS_DEBUG("#%d: precreation is empty\n", ost_idx);
continue;
}
/*
* try to use another OSP if this one is degraded
*/
- if (sfs->os_state == OS_STATE_DEGRADED && speed == 0) {
+ if (sfs->os_state == OS_STATE_DEGRADED && speed < 2) {
QOS_DEBUG("#%d: degraded\n", ost_idx);
continue;
}
lo->ldo_stripe[stripe_idx] = o;
stripe_idx++;
- /* We have enough stripes */
- if (stripe_idx == lo->ldo_stripenr)
- break;
}
if ((speed < 2) && (stripe_idx < stripe_cnt_min)) {
/* Try again, allowing slower OSCs */
goto repeat_find;
}
- cfs_up_read(&m->lod_qos.lq_rw_sem);
+ up_read(&m->lod_qos.lq_rw_sem);
if (stripe_idx) {
lo->ldo_stripenr = stripe_idx;
out:
if (pool != NULL) {
- cfs_up_read(&pool_tgt_rw_sem(pool));
+ up_read(&pool_tgt_rw_sem(pool));
/* put back ref got by lod_find_pool() */
lod_pool_putref(pool);
}
struct ost_pool *osts;
ENTRY;
+ rc = lod_qos_ost_in_use_clear(env, lo->ldo_stripenr);
+ if (rc)
+ GOTO(out, rc);
+
if (lo->ldo_pool)
pool = lod_find_pool(m, lo->ldo_pool);
if (pool != NULL) {
- cfs_down_read(&pool_tgt_rw_sem(pool));
+ down_read(&pool_tgt_rw_sem(pool));
osts = &(pool->pool_obds);
} else {
osts = &(m->lod_pool_info);
if (OBD_FAIL_CHECK(OBD_FAIL_MDS_OSC_PRECREATE) && ost_idx == 0)
continue;
+ /*
+ * do not put >1 objects on a single OST
+ */
+ if (lod_qos_is_ost_used(env, ost_idx, stripe_num))
+ continue;
+
/* Drop slow OSCs if we can, but not for requested start idx.
*
* This means "if OSC is slow and it is not the requested
rc = lod_statfs_and_check(env, m, ost_idx, sfs);
if (rc) {
/* this OSP doesn't feel well */
- CERROR("can't statfs #%u: %d\n", ost_idx, rc);
continue;
}
/*
- * skip empty devices - usually it means inactive device
- */
- if (sfs->os_blocks == 0)
- continue;
-
- /*
* We expect number of precreated objects in f_ffree at
* the first iteration, skip OSPs with no objects ready
* don't apply this logic to OST specified with stripe_offset
*/
- if (i != 0 && sfs->os_ffree == 0 && speed == 0)
+ if (i != 0 && sfs->os_fprecreated == 0 && speed == 0)
continue;
o = lod_qos_declare_object_on(env, m, ost_idx, th);
rc = -EFBIG;
out:
if (pool != NULL) {
- cfs_up_read(&pool_tgt_rw_sem(pool));
+ up_read(&pool_tgt_rw_sem(pool));
/* put back ref got by lod_find_pool() */
lod_pool_putref(pool);
}
return 1;
}
-/* Alloc objects on osts with optimization based on:
+/* Alloc objects on OSTs with optimization based on:
- free space
- network resources (shared OSS's)
*/
{
struct lod_device *m = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
struct obd_statfs *sfs = &lod_env_info(env)->lti_osfs;
- struct lod_ost_desc *ost;
+ struct lod_tgt_desc *ost;
struct dt_object *o;
__u64 total_weight = 0;
int nfound, good_osts, i, rc = 0;
pool = lod_find_pool(m, lo->ldo_pool);
if (pool != NULL) {
- cfs_down_read(&pool_tgt_rw_sem(pool));
+ down_read(&pool_tgt_rw_sem(pool));
osts = &(pool->pool_obds);
} else {
osts = &(m->lod_pool_info);
GOTO(out_nolock, rc = -EAGAIN);
/* Do actual allocation, use write lock here. */
- cfs_down_write(&m->lod_qos.lq_rw_sem);
+ down_write(&m->lod_qos.lq_rw_sem);
/*
* Check again, while we were sleeping on @lq_rw_sem things could
if (rc)
GOTO(out, rc);
+ rc = lod_qos_ost_in_use_clear(env, lo->ldo_stripenr);
+ if (rc)
+ GOTO(out, rc);
+
good_osts = 0;
/* Find all the OSTs that are valid stripe candidates */
for (i = 0; i < osts->op_count; i++) {
rc = lod_statfs_and_check(env, m, osts->op_array[i], sfs);
if (rc) {
/* this OSP doesn't feel well */
- CERROR("can't statfs #%u: %d\n", i, rc);
continue;
}
/*
- * skip empty devices - usually it means inactive device
- */
- if (sfs->os_blocks == 0)
- continue;
-
- /*
* skip full devices
*/
if (lod_qos_dev_is_full(sfs))
*/
if (lod_qos_is_ost_used(env, idx, nfound))
continue;
+ lod_qos_ost_in_use(env, nfound, idx);
o = lod_qos_declare_object_on(env, m, idx, th);
if (IS_ERR(o)) {
- CERROR("can't declare new object on #%u: %d\n",
- idx, (int) PTR_ERR(o));
+ QOS_DEBUG("can't declare object on #%u: %d\n",
+ idx, (int) PTR_ERR(o));
continue;
}
- lod_qos_ost_in_use(env, nfound, idx);
lo->ldo_stripe[nfound++] = o;
lod_qos_used(m, osts, idx, &total_weight);
rc = 0;
break;
}
- /* should never satisfy below condition */
if (rc) {
- CERROR("Didn't find any OSTs?\n");
+ /* no OST found on this iteration, give up */
break;
}
}
- LASSERT(nfound == stripe_cnt);
+
+ if (unlikely(nfound != stripe_cnt)) {
+ /*
+ * when the decision to use weighted algorithm was made
+ * we had enough appropriate OSPs, but this state can
+ * change anytime (no space on OST, broken connection, etc)
+ * so it's possible OSP won't be able to provide us with
+ * an object due to just changed state
+ */
+ LCONSOLE_INFO("wanted %d, found %d\n", stripe_cnt, nfound);
+ for (i = 0; i < nfound; i++) {
+ LASSERT(lo->ldo_stripe[i]);
+ lu_object_put(env, &lo->ldo_stripe[i]->do_lu);
+ lo->ldo_stripe[i] = NULL;
+ }
+
+ /* makes sense to rebalance next time */
+ m->lod_qos.lq_dirty = 1;
+ m->lod_qos.lq_same_space = 0;
+
+ rc = -EAGAIN;
+ }
out:
- cfs_up_write(&m->lod_qos.lq_rw_sem);
+ up_write(&m->lod_qos.lq_rw_sem);
out_nolock:
if (pool != NULL) {
- cfs_up_read(&pool_tgt_rw_sem(pool));
+ up_read(&pool_tgt_rw_sem(pool));
/* put back ref got by lod_find_pool() */
lod_pool_putref(pool);
}
v1 = buf->lb_buf;
magic = v1->lmm_magic;
- if (magic == __swab32(LOV_USER_MAGIC_V1))
+ if (magic == __swab32(LOV_USER_MAGIC_V1)) {
lustre_swab_lov_user_md_v1(v1);
- else if (magic == __swab32(LOV_USER_MAGIC_V3))
+ magic = v1->lmm_magic;
+ } else if (magic == __swab32(LOV_USER_MAGIC_V3)) {
+ v3 = buf->lb_buf;
lustre_swab_lov_user_md_v3(v3);
+ magic = v3->lmm_magic;
+ }
if (unlikely(magic != LOV_MAGIC_V1 && magic != LOV_MAGIC_V3)) {
/* try to use as fully defined striping */
v3 = buf->lb_buf;
lod_object_set_pool(lo, v3->lmm_pool_name);
+ /* In the function below, .hs_keycmp resolves to
+ * pool_hashkey_keycmp() */
+ /* coverity[overrun-buffer-val] */
pool = lod_find_pool(d, v3->lmm_pool_name);
if (pool != NULL) {
if (lo->ldo_def_stripe_offset !=
GOTO(out, rc = -ENOMEM);
lo->ldo_stripes_allocated = lo->ldo_stripenr;
- rc = lod_qos_ost_in_use_clear(env, lo->ldo_stripenr);
- if (rc)
- GOTO(out, rc);
-
- lod_getref(d);
+ lod_getref(&d->lod_ost_descs);
/* XXX: support for non-0 files w/o objects */
if (lo->ldo_def_stripe_offset >= d->lod_desc.ld_tgt_count) {
lod_qos_statfs_update(env, d);
rc = lod_alloc_rr(env, lo, flag, th);
} else
rc = lod_alloc_specific(env, lo, flag, th);
- lod_putref(d);
+ lod_putref(d, &d->lod_ost_descs);
} else {
/*
* lod_qos_parse_config() found supplied buf as a predefined