* Copyright 2009 Sun Microsystems, Inc. All rights reserved
* Use is subject to license terms.
*
- * Copyright (c) 2012, 2014, Intel Corporation.
+ * Copyright (c) 2012, 2015, Intel Corporation.
*/
/*
* This file is part of Lustre, http://www.lustre.org/
#include <asm/div64.h>
#include <libcfs/libcfs.h>
-#include <obd_class.h>
#include <lustre/lustre_idl.h>
+#include <lustre_swab.h>
+#include <obd_class.h>
+
#include "lod_internal.h"
/*
spin_lock(&d->lod_desc_lock);
if (ost->ltd_active) {
ost->ltd_active = 0;
+ if (rc == -ENOTCONN)
+ ost->ltd_connecting = 1;
+
LASSERT(d->lod_desc.ld_active_tgt_count > 0);
d->lod_desc.ld_active_tgt_count--;
d->lod_qos.lq_dirty = 1;
spin_lock(&d->lod_desc_lock);
if (ost->ltd_active == 0) {
ost->ltd_active = 1;
+ ost->ltd_connecting = 0;
d->lod_desc.ld_active_tgt_count++;
d->lod_qos.lq_dirty = 1;
d->lod_qos.lq_rr.lqr_dirty = 1;
RETURN(0);
}
+/**
+ * Initialize an allocation state of type struct lod_qos_rr.
+ *
+ * Marks the state dirty and sets up the lqr_alloc spinlock so the
+ * structure is ready for first use.
+ *
+ * \param[in,out] lqr	state structure to initialize
+ */
+void lod_qos_rr_init(struct lod_qos_rr *lqr)
+{
+	lqr->lqr_dirty = 1;
+	spin_lock_init(&lqr->lqr_alloc);
+}
+
+
#define LOV_QOS_EMPTY ((__u32)-1)
/**
return 0;
}
+/**
+ * Check a single OST and, if it is acceptable, reserve an object on it.
+ *
+ * The OST is run through a series of checks whose strictness depends on
+ * \a speed. If all of them pass, a new object is declared on the OST,
+ * stored in the next free slot of \a stripe, and \a *s_idx is advanced.
+ *
+ * \param[in] env	execution environment for this thread
+ * \param[in] m		LOD device
+ * \param[in,out] sfs	statfs buffer handed to lod_statfs_and_check() and
+ *			inspected afterwards
+ * \param[in] ost_idx	index of the OST to examine
+ * \param[in] speed	strictness level: 0 skips OSTs without precreated
+ *			objects, values below 2 skip degraded OSTs, and
+ *			non-zero values enable the "already used" check
+ * \param[in,out] s_idx	index of the next free stripe slot; incremented
+ *			only when an object has been reserved
+ * \param[out] stripe	array that receives the declared object
+ * \param[in] th	transaction handle
+ *
+ * \retval 0		object reserved, or OST skipped (\a *s_idx unchanged)
+ * \retval non-zero	value returned by lod_statfs_and_check(), or the
+ *			PTR_ERR() of a failed object declaration
+ */
+static int lod_check_and_reserve_ost(const struct lu_env *env,
+				     struct lod_device *m,
+				     struct obd_statfs *sfs, __u32 ost_idx,
+				     __u32 speed, __u32 *s_idx,
+				     struct dt_object **stripe,
+				     struct thandle *th)
+{
+	__u32 slot = *s_idx;
+	struct dt_object *obj;
+	int rc;
+
+	/* a non-zero status means this OSP cannot be used right now */
+	rc = lod_statfs_and_check(env, m, ost_idx, sfs);
+	if (rc != 0)
+		return rc;
+
+	/* never allocate on a device that reports itself full */
+	if (lod_qos_dev_is_full(sfs)) {
+		QOS_DEBUG("#%d is full\n", ost_idx);
+		return 0;
+	}
+
+	/* at speed 0, insist on OSPs that have precreated objects ready */
+	if (speed == 0 && sfs->os_fprecreated == 0) {
+		QOS_DEBUG("#%d: precreation is empty\n", ost_idx);
+		return 0;
+	}
+
+	/* degraded targets are accepted only once speed reaches 2 */
+	if (speed < 2 && (sfs->os_state & OS_STATE_DEGRADED) != 0) {
+		QOS_DEBUG("#%d: degraded\n", ost_idx);
+		return 0;
+	}
+
+	/* at non-zero speed, refuse a second object on the same OST */
+	if (speed != 0 && lod_qos_is_ost_used(env, ost_idx, slot))
+		return 0;
+
+	obj = lod_qos_declare_object_on(env, m, ost_idx, th);
+	if (IS_ERR(obj)) {
+		CDEBUG(D_OTHER, "can't declare new object on #%u: %d\n",
+		       ost_idx, (int) PTR_ERR(obj));
+		return PTR_ERR(obj);
+	}
+
+	/* the object is declared (reserved): record it and advance */
+	lod_qos_ost_in_use(env, slot, ost_idx);
+	stripe[slot] = obj;
+	*s_idx = slot + 1;
+
+	return 0;
+}
+
/**
* Allocate a striping using round-robin algorithm.
*
struct pool_desc *pool = NULL;
struct ost_pool *osts;
struct lod_qos_rr *lqr;
- struct dt_object *o;
unsigned int i, array_idx;
int rc;
__u32 ost_start_idx_temp;
int speed = 0;
+ int ost_connecting = 0;
__u32 stripe_idx = 0;
__u32 stripe_cnt = lo->ldo_stripenr;
__u32 stripe_cnt_min = min_stripe_count(stripe_cnt, flags);
if (rc)
GOTO(out, rc);
+ down_read(&m->lod_qos.lq_rw_sem);
+ spin_lock(&lqr->lqr_alloc);
if (--lqr->lqr_start_count <= 0) {
lqr->lqr_start_idx = cfs_rand() % osts->op_count;
lqr->lqr_start_count =
if (stripe_cnt > 1 && (osts->op_count % stripe_cnt) != 1)
++lqr->lqr_offset_idx;
}
- down_read(&m->lod_qos.lq_rw_sem);
ost_start_idx_temp = lqr->lqr_start_idx;
repeat_find:
- array_idx = (lqr->lqr_start_idx + lqr->lqr_offset_idx) %
- osts->op_count;
QOS_DEBUG("pool '%s' want %d startidx %d startcnt %d offset %d "
- "active %d count %d arrayidx %d\n",
+ "active %d count %d\n",
lo->ldo_pool ? lo->ldo_pool : "",
stripe_cnt, lqr->lqr_start_idx, lqr->lqr_start_count,
- lqr->lqr_offset_idx, osts->op_count, osts->op_count,
- array_idx);
+ lqr->lqr_offset_idx, osts->op_count, osts->op_count);
- for (i = 0; i < osts->op_count && stripe_idx < lo->ldo_stripenr;
- i++, array_idx = (array_idx + 1) % osts->op_count) {
+ for (i = 0; i < osts->op_count && stripe_idx < lo->ldo_stripenr; i++) {
+ array_idx = (lqr->lqr_start_idx + lqr->lqr_offset_idx) %
+ osts->op_count;
++lqr->lqr_start_idx;
ost_idx = lqr->lqr_pool.op_array[array_idx];
if (OBD_FAIL_CHECK(OBD_FAIL_MDS_OSC_PRECREATE) && ost_idx == 0)
continue;
- rc = lod_statfs_and_check(env, m, ost_idx, sfs);
- if (rc) {
- /* this OSP doesn't feel well */
- continue;
- }
-
- /*
- * skip full devices
- */
- if (lod_qos_dev_is_full(sfs)) {
- QOS_DEBUG("#%d is full\n", ost_idx);
- continue;
- }
-
- /*
- * We expect number of precreated objects in f_ffree at
- * the first iteration, skip OSPs with no objects ready
- */
- if (sfs->os_fprecreated == 0 && speed == 0) {
- QOS_DEBUG("#%d: precreation is empty\n", ost_idx);
- continue;
- }
-
- /*
- * try to use another OSP if this one is degraded
- */
- if (sfs->os_state & OS_STATE_DEGRADED && speed < 2) {
- QOS_DEBUG("#%d: degraded\n", ost_idx);
- continue;
- }
-
- /*
- * do not put >1 objects on a single OST
- */
- if (speed && lod_qos_is_ost_used(env, ost_idx, stripe_idx))
- continue;
-
- o = lod_qos_declare_object_on(env, m, ost_idx, th);
- if (IS_ERR(o)) {
- CDEBUG(D_OTHER, "can't declare new object on #%u: %d\n",
- ost_idx, (int) PTR_ERR(o));
- rc = PTR_ERR(o);
- continue;
- }
-
- /*
- * We've successfully declared (reserved) an object
- */
- lod_qos_ost_in_use(env, stripe_idx, ost_idx);
- stripe[stripe_idx] = o;
- stripe_idx++;
+ spin_unlock(&lqr->lqr_alloc);
+ rc = lod_check_and_reserve_ost(env, m, sfs, ost_idx, speed,
+ &stripe_idx, stripe, th);
+ spin_lock(&lqr->lqr_alloc);
+ if (rc != 0 && OST_TGT(m, ost_idx)->ltd_connecting)
+ ost_connecting = 1;
}
if ((speed < 2) && (stripe_idx < stripe_cnt_min)) {
/* Try again, allowing slower OSCs */
speed++;
lqr->lqr_start_idx = ost_start_idx_temp;
+
+ ost_connecting = 0;
goto repeat_find;
}
+ spin_unlock(&lqr->lqr_alloc);
up_read(&m->lod_qos.lq_rw_sem);
if (stripe_idx) {
rc = 0;
} else {
/* nobody provided us with a single object */
- rc = -ENOSPC;
+ if (ost_connecting)
+ rc = -EINPROGRESS;
+ else
+ rc = -ENOSPC;
}
out:
/**
* Allocate a striping on a predefined set of OSTs.
*
- * Allocates new striping starting from OST provided lo->ldo_def_stripe_offset.
+ * Allocates new layout starting from OST index in lo->ldo_def_stripe_offset.
* Full OSTs are not considered. The exact order of OSTs is not important and
* varies depending on OST status. The allocation procedure prefers the targets
* with precreated objects ready. The number of stripes needed and stripe
- * offset are taken from the object. If that number can not be met, then the
- * function returns a failure and then it's the caller's responsibility to
+ * offset are taken from the object. If that number cannot be met, then the
+ * function returns an error and then it's the caller's responsibility to
* release the stripes allocated. All the internal structures are protected,
* but no concurrent allocation is allowed on the same objects.
*
* \param[in] th transaction handle
*
* \retval 0 on success
- * \retval -E2BIG if no enough OSTs are found
+ * \retval -ENOSPC if no OST objects are available at all
+ * \retval -EFBIG if not enough OST objects are found
* \retval -EINVAL requested offset is invalid
- * \retval negative negated errno on error
+ * \retval negative errno on failure
*/
static int lod_alloc_specific(const struct lu_env *env, struct lod_object *lo,
struct dt_object **stripe, int flags,
/* If we were passed specific striping params, then a failure to
* meet those requirements is an error, since we can't reallocate
* that memory (it might be part of a larger array or something).
- *
- * We can only get here if lsm_stripe_count was originally > 1.
*/
CERROR("can't lstripe objid "DFID": have %d want %u\n",
PFID(lu_object_fid(lod2lu_obj(lo))), stripe_num,
lo->ldo_stripenr);
- rc = -EFBIG;
+ rc = stripe_num == 0 ? -ENOSPC : -EFBIG;
out:
if (pool != NULL) {
up_read(&pool_tgt_rw_sem(pool));
* The function allocates OST objects to create a striping. The algorithm
* used is based on weights (currently only using the free space), and it's
* trying to ensure the space is used evenly by OSTs and OSSs. The striping
- * configuration (# of stripes, offset,
- * pool) is taken from the object and is prepared by the caller.
+ * configuration (# of stripes, offset, pool) is taken from the object and
+ * is prepared by the caller.
+ *
* If LOV_USES_DEFAULT_STRIPE is not passed and prepared configuration can't
- * be met due to too few OSTs, then allocation fails. If the flag is
- * passed and less than 75% of the requested number of stripes can be
- * allocated, then allocation fails.
- * No concurrent allocation is allowed on the object and this must be
- * ensured by the caller. All the internal structures are protected by the
- * function.
- * The algorithm has two steps: find available OSTs and calucate their weights,
- * then select the OSTs the weights used as the probability. An OST with a
- * higher weight is proportionately more likely to be selected than one with
- * a lower weight.
+ * be met due to too few OSTs, then allocation fails. If the flag is passed
+ * and fewer than 3/4 of the requested number of stripes can be allocated,
+ * then allocation fails.
+ *
+ * No concurrent allocation is allowed on the object and this must be ensured
+ * by the caller. All the internal structures are protected by the function.
+ *
+ * The algorithm has two steps: find available OSTs and calculate their
+ * weights, then select the OSTs with their weights used as the probability.
+ * An OST with a higher weight is proportionately more likely to be selected
+ * than one with a lower weight.
*
* \param[in] env execution environment for this thread
* \param[in] lo LOD object
* \param[in] th transaction handle
*
* \retval 0 on success
- * \retval -E2BIG if no enough OSTs are found
- * \retval -EINVAL requested offset is invalid
- * \retval negative negated errno on error
+ * \retval -EAGAIN not enough OSTs are found for specified stripe count
+ * \retval -EINVAL requested OST index is invalid
+ * \retval negative errno on failure
*/
static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo,
struct dt_object **stripe, int flags,
rand = 0;
}
- /* On average, this will hit larger-weighted osts more often.
- 0-weight osts will always get used last (only when rand=0) */
+ /* On average, this will hit larger-weighted OSTs more often.
+ * 0-weight OSTs will always get used last (only when rand=0) */
for (i = 0; i < osts->op_count; i++) {
__u32 idx = osts->op_array[i];