*
* lustre/lod/lod_qos.c
*
+ * Implementation of different allocation algorithm used
+ * to distribute objects and data among OSTs.
*/
#define DEBUG_SUBSYSTEM S_LOV
#define TGT_BAVAIL(i) (OST_TGT(lod,i)->ltd_statfs.os_bavail * \
OST_TGT(lod,i)->ltd_statfs.os_bsize)
+/**
+ * Add a new target to Quality of Service (QoS) target table.
+ *
+ * Add a new OST target to the structure representing an OSS. Resort the list
+ * of known OSSs by the number of OSTs attached to each OSS. The OSS list is
+ * protected internally and no external locking is required.
+ *
+ * \param[in] lod LOD device
+ * \param[in] ost_desc OST description
+ *
+ * \retval 0 on success
+ * \retval -ENOMEM on error
+ */
int qos_add_tgt(struct lod_device *lod, struct lod_tgt_desc *ost_desc)
{
struct lod_qos_oss *oss = NULL, *temposs;
RETURN(rc);
}
+/**
+ * Remove OST target from QoS table.
+ *
+ * Removes given OST target from QoS table and releases related OSS structure
+ * if no OSTs remain on the OSS.
+ *
+ * \param[in] lod LOD device
+ * \param[in] ost_desc OST description
+ *
+ * \retval 0 on success
+ * \retval -ENOENT if no OSS was found
+ */
int qos_del_tgt(struct lod_device *lod, struct lod_tgt_desc *ost_desc)
{
struct lod_qos_oss *oss;
RETURN(rc);
}
+/**
+ * Check whether the target is available for new OST objects.
+ *
+ * Request statfs data from the given target and verify it's active and not
+ * read-only. If so, then it can be used to place new OST objects. This
+ * function also maintains the number of active/inactive targets and sets
+ * dirty flags if those numbers change so others can run re-balance procedures.
+ * No external locking is required.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] d LOD device
+ * \param[in] index index of OST target to check
+ * \param[out] sfs buffer for statfs data
+ *
+ * \retval 0 if the target is good
+ * \retval negative negated errno on error
+
+ */
static int lod_statfs_and_check(const struct lu_env *env, struct lod_device *d,
int index, struct obd_statfs *sfs)
{
RETURN(rc);
}
+/**
+ * Maintain per-target statfs data.
+ *
+ * The function refreshes statfs data for all the targets every N seconds.
+ * The actual N is controlled via procfs and set to LOV_DESC_QOS_MAXAGE_DEFAULT
+ * initially.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] lod LOD device
+ */
static void lod_qos_statfs_update(const struct lu_env *env,
struct lod_device *lod)
{
EXIT;
}
-/* Recalculate per-object penalties for OSSs and OSTs,
- depends on size of each ost in an oss */
+/**
+ * Calculate per-OST and per-OSS penalties
+ *
+ * Re-calculate penalties when the configuration changes, active targets
+ * change and after statfs refresh (all these are reflected by lq_dirty flag).
+ * On every OST and OSS: decay the penalty by half for every 8x the update
+ * interval that the device has been idle. That gives lots of time for the
+ * statfs information to be updated (which the penalty is only a proxy for),
+ * and avoids penalizing OSS/OSTs under light load.
+ * See lod_qos_calc_weight() for how penalties are factored into the weight.
+ *
+ * \param[in] lod LOD device
+ *
+ * \retval 0 on success
+ * \retval -EAGAIN the number of OSTs isn't enough
+ */
static int lod_qos_calc_ppo(struct lod_device *lod)
{
struct lod_qos_oss *oss;
/* find bavail on each OSS */
list_for_each_entry(oss, &lod->lod_qos.lq_oss_list, lqo_oss_list)
- oss->lqo_bavail = 0;
+ oss->lqo_bavail = 0;
lod->lod_qos.lq_active_oss_count = 0;
/*
age > 32 * lod->lod_desc.ld_qos_maxage)
OST_TGT(lod,i)->ltd_qos.ltq_penalty = 0;
else if (age > lod->lod_desc.ld_qos_maxage)
- /* Decay the penalty by half for every 8x the update
- * interval that the device has been idle. That gives
- * lots of time for the statfs information to be
- * updated (which the penalty is only a proxy for),
- * and avoids penalizing OSS/OSTs under light load. */
+ /* Decay OST penalty. */
OST_TGT(lod,i)->ltd_qos.ltq_penalty >>=
(age / lod->lod_desc.ld_qos_maxage);
}
age > 32 * lod->lod_desc.ld_qos_maxage)
oss->lqo_penalty = 0;
else if (age > lod->lod_desc.ld_qos_maxage)
- /* Decay the penalty by half for every 8x the update
- * interval that the device has been idle. That gives
- * lots of time for the statfs information to be
- * updated (which the penalty is only a proxy for),
- * and avoids penalizing OSS/OSTs under light load. */
+ /* Decay OSS penalty. */
oss->lqo_penalty >>= age / lod->lod_desc.ld_qos_maxage;
}
RETURN(rc);
}
+/**
+ * Calculate weight for a given OST target.
+ *
+ * The final OST weight is the number of bytes available minus the OST and
+ * OSS penalties. See lod_qos_calc_ppo() for how penalties are calculated.
+ *
+ * \param[in] lod LOD device, where OST targets are listed
+ * \param[in] i OST target index
+ *
+ * \retval 0
+ */
static int lod_qos_calc_weight(struct lod_device *lod, int i)
{
__u64 temp, temp2;
- /* Final ost weight = TGT_BAVAIL - ost_penalty - oss_penalty */
temp = TGT_BAVAIL(i);
temp2 = OST_TGT(lod,i)->ltd_qos.ltq_penalty +
OST_TGT(lod,i)->ltd_qos.ltq_oss->lqo_penalty;
return 0;
}
-/* We just used this index for a stripe; adjust everyone's weights */
+/**
+ * Re-calculate weights.
+ *
+ * The function is called when some OST target was used for a new object. In
+ * this case we should re-calculate all the weights to keep new allocations
+ * balanced well.
+ *
+ * \param[in] lod LOD device
+ * \param[in] osts OST pool where a new object was placed
+ * \param[in] index OST target where a new object was placed
+ * \param[out] total_wt new total weight for the pool
+ *
+ * \retval 0
+ */
static int lod_qos_used(struct lod_device *lod, struct ost_pool *osts,
__u32 index, __u64 *total_wt)
{
}
#define LOV_QOS_EMPTY ((__u32)-1)
-/* compute optimal round-robin order, based on OSTs per OSS */
+
+/**
+ * Calculate optimal round-robin order with regard to OSSes.
+ *
+ * Place all the OSTs from pool \a src_pool in a special array to be used for
+ * round-robin (RR) stripe allocation. The placement algorithm interleaves
+ * OSTs from the different OSSs so that RR allocation can balance OSSs evenly.
+ * Resorts the targets when the number of active targets changes (because of
+ * a new target or activation/deactivation).
+ *
+ * \param[in] lod LOD device
+ * \param[in] src_pool OST pool
+ * \param[in] lqr round-robin list
+ *
+ * \retval 0 on success
+ * \retval -ENOMEM fails to allocate the array
+ */
static int lod_qos_calc_rr(struct lod_device *lod, struct ost_pool *src_pool,
struct lod_qos_rr *lqr)
{
}
/**
- * A helper function to:
- * create in-core lu object on the specified OSP
- * declare creation of the object
- * IMPORTANT: at this stage object is anonymouos - it has no fid assigned
- * this is a workaround till we have natural FIDs on OST
- *
- * at this point we want to declare (reserve) object for us as
- * we can't block at execution (when create method is called).
- * otherwise we'd block whole transaction batch
+ * Instantiate and declare creation of a new object.
+ *
+ * The function instantiates LU representation for a new object on the
+ * specified device. Also it declares an intention to create that
+ * object on the storage target.
+ *
+ * Note lu_object_anon() is used which is a trick with regard to LU/OSD
+ * infrastructure - in the existing precreation framework we can't assign FID
+ * at this moment, we do this later once a transaction is started. So the
+ * special method instantiates FID-less object in the cache and later it
+ * will get a FID and proper placement in LU cache.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] d LOD device
+ * \param[in] ost_idx OST target index where the object is being created
+ * \param[in] th transaction handle
+ *
+ * \retval object ptr on success, ERR_PTR() otherwise
*/
static struct dt_object *lod_qos_declare_object_on(const struct lu_env *env,
struct lod_device *d,
RETURN(dt);
}
+/**
+ * Calculate a minimum acceptable stripe count.
+ *
+ * Return an acceptable stripe count depending on flag LOV_USES_DEFAULT_STRIPE:
+ * all stripes or 3/4 of stripes.
+ *
+ * \param[in] stripe_cnt number of stripes requested
+ * \param[in] flags 0 or LOV_USES_DEFAULT_STRIPE
+ *
+ * \retval acceptable stripecount
+ */
static int min_stripe_count(__u32 stripe_cnt, int flags)
{
return (flags & LOV_USES_DEFAULT_STRIPE ?
#define LOV_CREATE_RESEED_MULT 30
#define LOV_CREATE_RESEED_MIN 2000
+/**
+ * Check if an OST is full.
+ *
+ * Check whether an OST should be considered full based
+ * on the given statfs data.
+ *
+ * \param[in] msfs statfs data
+ *
+ * \retval false not full
+ * \retval true full
+ */
static int inline lod_qos_dev_is_full(struct obd_statfs *msfs)
{
__u64 used;
return (msfs->os_bavail < used);
}
+/**
+ * Initialize temporary OST-in-use array.
+ *
+ * Allocate or extend the array used to mark targets already assigned to a new
+ * striping so they are not used more than once.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] stripes number of items needed in the array
+ *
+ * \retval 0 on success
+ * \retval -ENOMEM on error
+ */
static inline int lod_qos_ost_in_use_clear(const struct lu_env *env,
__u32 stripes)
{
return 0;
}
-static inline void lod_qos_ost_in_use(const struct lu_env *env, int idx, int ost)
+/**
+ * Remember a target in the array of used targets.
+ *
+ * Mark the given target as used for a new striping being created. The status
+ * of an OST in a striping can be checked with lod_qos_is_ost_used().
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] idx index in the array
+ * \param[in] ost OST target index to mark as used
+ */
+static inline void lod_qos_ost_in_use(const struct lu_env *env,
+ int idx, int ost)
{
struct lod_thread_info *info = lod_env_info(env);
int *osts = info->lti_ea_store;
osts[idx] = ost;
}
+/**
+ * Check is OST used in a striping.
+ *
+ * Checks whether OST with the given index is marked as used in the temporary
+ * array (see lod_qos_ost_in_use()).
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] ost OST target index to check
+ * \param[in] stripes the number of items used in the array already
+ *
+ * \retval 0 not used
+ * \retval 1 used
+ */
static int lod_qos_is_ost_used(const struct lu_env *env, int ost, __u32 stripes)
{
struct lod_thread_info *info = lod_env_info(env);
return 0;
}
-/* Allocate objects on OSTs with round-robin algorithm */
+/**
+ * Allocate a striping using round-robin algorigthm.
+ *
+ * Allocates a new striping using round-robin algorithm. The function refreshes
+ * all the internal structures (statfs cache, array of available OSTs sorted
+ * with regard to OSS, etc). The number of stripes required is taken from the
+ * object (must be prepared by the caller), but can change if the flag
+ * LOV_USES_DEFAULT_STRIPE is supplied. The caller should ensure nobody else
+ * is trying to create a striping on the object in parallel. All the internal
+ * structures (like pools, etc) are protected and no additional locking is
+ * required. The function succeeds even if a single stripe is allocated. To save
+ * time we give priority to targets which already have objects precreated.
+ * Full OSTs are skipped (see lod_qos_dev_is_full() for the details).
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] lo LOD object
+ * \param[out] stripe striping created
+ * \param[in] flags allocation flags (0 or LOV_USES_DEFAULT_STRIPE)
+ * \param[in] th transaction handle
+ *
+ * \retval 0 on success
+ * \retval -ENOSPC if not enough OSTs are found
+ * \retval negative negated errno for other failures
+ */
static int lod_alloc_rr(const struct lu_env *env, struct lod_object *lo,
struct dt_object **stripe, int flags,
struct thandle *th)
RETURN(rc);
}
-/* alloc objects on osts with specific stripe offset */
+/**
+ * Allocate a striping on a predefined set of OSTs.
+ *
+ * Allocates new striping starting from OST provided lo->ldo_def_stripe_offset.
+ * Full OSTs are not considered. The exact order of OSTs is not important and
+ * varies depending on OST status. The allocation procedure prefers the targets
+ * with precreated objects ready. The number of stripes needed and stripe
+ * offset are taken from the object. If that number can not be met, then the
+ * function returns a failure and then it's the caller's responsibility to
+ * release the stripes allocated. All the internal structures are protected,
+ * but no concurrent allocation is allowed on the same objects.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] lo LOD object
+ * \param[out] stripe striping created
+ * \param[in] flags not used
+ * \param[in] th transaction handle
+ *
+ * \retval 0 on success
+ * \retval -E2BIG if no enough OSTs are found
+ * \retval -EINVAL requested offset is invalid
+ * \retval negative negated errno on error
+ */
static int lod_alloc_specific(const struct lu_env *env, struct lod_object *lo,
struct dt_object **stripe, int flags,
struct thandle *th)
RETURN(rc);
}
+/**
+ * Check whether QoS allocation should be used.
+ *
+ * A simple helper to decide when QoS allocation should be used:
+ * if it's just a single available target or the used space is
+ * evenly distributed among the targets at the moment, then QoS
+ * allocation algorithm should not be used.
+ *
+ * \param[in] lod LOD device
+ *
+ * \retval 0 should not be used
+ * \retval 1 should be used
+ */
static inline int lod_qos_is_usable(struct lod_device *lod)
{
#ifdef FORCE_QOS
return 1;
}
-/* Alloc objects on OSTs with optimization based on:
- - free space
- - network resources (shared OSS's)
+/**
+ * Allocate a striping using an algorithm with weights.
+ *
+ * The function allocates OST objects to create a striping. The algorithm
+ * used is based on weights (currently only using the free space), and it's
+ * trying to ensure the space is used evenly by OSTs and OSSs. The striping
+ * configuration (# of stripes, offset,
+ * pool) is taken from the object and is prepared by the caller.
+ * If LOV_USES_DEFAULT_STRIPE is not passed and prepared configuration can't
+ * be met due to too few OSTs, then allocation fails. If the flag is
+ * passed and less than 75% of the requested number of stripes can be
+ * allocated, then allocation fails.
+ * No concurrent allocation is allowed on the object and this must be
+ * ensured by the caller. All the internal structures are protected by the
+ * function.
+ * The algorithm has two steps: find available OSTs and calucate their weights,
+ * then select the OSTs the weights used as the probability. An OST with a
+ * higher weight is proportionately more likely to be selected than one with
+ * a lower weight.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] lo LOD object
+ * \param[out] stripe striping created
+ * \param[in] flags 0 or LOV_USES_DEFAULT_STRIPE
+ * \param[in] th transaction handle
+ *
+ * \retval 0 on success
+ * \retval -E2BIG if no enough OSTs are found
+ * \retval -EINVAL requested offset is invalid
+ * \retval negative negated errno on error
*/
static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo,
struct dt_object **stripe, int flags,
RETURN(rc);
}
-/* Find the max stripecount we should use */
+/**
+ * Find largest stripe count the caller can use.
+ *
+ * Find the maximal possible stripe count not greater than \a stripe_count.
+ * Sometimes suggested stripecount can't be reached for a number of reasons:
+ * lack of enough active OSTs or the backend does not support EAs that large.
+ * If the passed one is 0, then the filesystem's default one is used.
+ *
+ * \param[in] lod LOD device
+ * \param[in] magic the format if striping
+ * \param[in] stripe_count count the caller would like to use
+ *
+ * \retval the maximum usable stripe count
+ */
static __u16 lod_get_stripecnt(struct lod_device *lod, __u32 magic,
__u16 stripe_count)
{
return (stripe_count < max_stripes) ? stripe_count : max_stripes;
}
+/**
+ * Create in-core respresentation for a fully-defined striping
+ *
+ * When the caller passes a fully-defined striping (i.e. everything including
+ * OST object FIDs are defined), then we still need to instantiate LU-cache
+ * with the objects representing the stripes defined. This function completes
+ * that task.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] mo LOD object
+ * \param[in] buf buffer containing the striping
+ *
+ * \retval 0 on success
+ * \retval negative negated errno on error
+ */
static int lod_use_defined_striping(const struct lu_env *env,
struct lod_object *mo,
const struct lu_buf *buf)
RETURN(rc);
}
+/**
+ * Parse suggested striping configuration.
+ *
+ * The caller gets a suggested striping configuration from a number of sources
+ * including per-directory default and applications. Then it needs to verify
+ * the suggested striping is valid, apply missing bits and store the resulting
+ * configuration in the object to be used by the allocator later. Must not be
+ * called concurrently against the same object. It's OK to provide a
+ * fully-defined striping.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] lo LOD object
+ * \param[in] buf buffer containing the striping
+ *
+ * \retval 0 on success
+ * \retval negative negated errno on error
+ */
static int lod_qos_parse_config(const struct lu_env *env,
struct lod_object *lo,
const struct lu_buf *buf)
RETURN(0);
}
-/*
- * buf should be NULL or contain striping settings
+/**
+ * Create a striping for an obejct.
+ *
+ * The function creates a new striping for the object. A buffer containing
+ * configuration hints can be provided optionally. The function tries QoS
+ * algorithm first unless free space is distributed evenly among OSTs, but
+ * by default RR algorithm is preferred due to internal concurrency (QoS is
+ * serialized). The caller must ensure no concurrent calls to the function
+ * are made against the same object.
+ *
+ * \param[in] env execution environment for this thread
+ * \param[in] lo LOD object
+ * \param[in] attr attributes OST objects will be declared with
+ * \param[in] buf suggested striping configuration or NULL
+ * \param[in] th transaction handle
+ *
+ * \retval 0 on success
+ * \retval negative negated errno on error
*/
int lod_qos_prep_create(const struct lu_env *env, struct lod_object *lo,
struct lu_attr *attr, const struct lu_buf *buf,