__u32 ld_pattern; /* PATTERN_RAID0, PATTERN_RAID1 */
__u64 ld_default_stripe_size; /* in bytes */
__u64 ld_default_stripe_offset; /* in bytes */
+ __u32 ld_qos_threshold; /* in MB */
+ __u32 ld_qos_maxage; /* in second */
__u32 ld_padding_1; /* also fix lustre_swab_lov_desc */
__u32 ld_padding_2; /* also fix lustre_swab_lov_desc */
- __u32 ld_padding_3; /* also fix lustre_swab_lov_desc */
- __u32 ld_padding_4; /* also fix lustre_swab_lov_desc */
struct obd_uuid ld_uuid;
};
__u32 ltd_gen;
struct obd_export *ltd_exp;
int active; /* is this target up for requests */
+ int index; /* index of target array in lov_obd */
+ struct list_head qos_bavail_list; /* link entry to lov_obd */
};
struct lov_obd {
int bufsize;
int refcount;
unsigned int lo_catalog_loaded:1;
+ struct list_head qos_bavail_list; /* tgts list, sorted by available space, protected by lov_lock */
struct lov_tgt_desc *tgts;
};
#define IOC_LOV_SET_OSC_ACTIVE _IOWR('g', 50, long)
#define IOC_LOV_MAX_NR 50
+#define QOS_DEFAULT_THRESHOLD 10 /* MB */
+#define QOS_DEFAULT_MAXAGE 5 /* Seconds */
+
#endif
/* lov_qos.c */
void qos_shrink_lsm(struct lov_request_set *set);
-int qos_prep_create(struct lov_obd *lov, struct lov_request_set *set,
- int newea);
+int qos_prep_create(struct obd_export *exp, struct lov_request_set *set);
+void qos_update(struct lov_obd *lov, int idx, struct obd_statfs *osfs);
int qos_remedy_create(struct lov_request_set *set, struct lov_request *req);
/* lov_request.c */
RETURN(-ENOMEM);
}
- memset(tgt, 0, bufsize);
if (lov->tgts) {
+ int i;
memcpy(tgt, lov->tgts, lov->bufsize);
+ LASSERT(index == lov->desc.ld_tgt_count);
+ for (i = 0; i < index; i++) {
+ INIT_LIST_HEAD(&tgt[i].qos_bavail_list);
+ list_splice(&lov->tgts[i].qos_bavail_list,
+ &tgt[i].qos_bavail_list);
+ }
OBD_FREE(lov->tgts, lov->bufsize);
}
tgt->uuid = *uuidp;
/* XXX - add a sanity check on the generation number. */
tgt->ltd_gen = gen;
+ tgt->index = index;
+ INIT_LIST_HEAD(&tgt->qos_bavail_list);
old_count = lov->desc.ld_tgt_count;
if (index >= lov->desc.ld_tgt_count)
struct lustre_cfg *lcfg = buf;
struct lov_desc *desc;
struct lov_obd *lov = &obd->u.lov;
- int count;
+ struct lov_tgt_desc *tgts;
+ int count, i;
ENTRY;
if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) {
CERROR("Out of memory\n");
RETURN(-EINVAL);
}
- memset(lov->tgts, 0, lov->bufsize);
+ for (i = 0, tgts = lov->tgts; i < max(count, 1); i++, tgts++) {
+ tgts->index = i;
+ INIT_LIST_HEAD(&tgts->qos_bavail_list);
+ }
desc->ld_active_tgt_count = 0;
lov->desc = *desc;
spin_lock_init(&lov->lov_lock);
+ INIT_LIST_HEAD(&lov->qos_bavail_list);
lprocfs_init_vars(lov, &lvars);
lprocfs_obd_setup(obd, lvars.obd_vars);
/* the LOV expects oa->o_id to be set to the LOV object id */
static int
lov_create(struct obd_export *exp, struct obdo *src_oa,
- struct lov_stripe_md **ea, struct obd_trans_info *oti)
+ struct lov_stripe_md **ea, struct obd_trans_info *oti)
{
struct lov_request_set *set = NULL;
- struct list_head *pos;
struct lov_obd *lov;
+ struct obd_statfs osfs;
+ unsigned long maxage;
+ struct lov_request *req;
int rc = 0;
ENTRY;
lov = &exp->exp_obd->u.lov;
if (!lov->desc.ld_active_tgt_count)
RETURN(-EIO);
+
+ maxage = jiffies - lov->desc.ld_qos_maxage * HZ;
+ obd_statfs(exp->exp_obd, &osfs, maxage);
rc = lov_prep_create_set(exp, ea, src_oa, oti, &set);
if (rc)
RETURN(rc);
- list_for_each (pos, &set->set_list) {
- struct lov_request *req =
- list_entry(pos, struct lov_request, rq_link);
-
+ list_for_each_entry(req, &set->set_list, rq_link) {
/* XXX: LOV STACKING: use real "obj_mdp" sub-data */
rc = obd_create(lov->tgts[req->rq_idx].ltd_exp,
req->rq_oa, &req->rq_md, oti);
rc = err;
continue;
}
+ qos_update(lov, i, &lov_sfs);
if (!set) {
memcpy(osfs, &lov_sfs, sizeof(lov_sfs));
#define DEBUG_SUBSYSTEM S_LOV
#ifdef __KERNEL__
+#include <linux/types.h>
+#include <linux/random.h>
#else
#include <liblustre.h>
#endif
#define LOV_CREATE_RESEED_MULT 4
#define LOV_CREATE_RESEED_MIN 1000
-/* FIXME use real qos data to prepare the lov create request */
-int qos_prep_create(struct lov_obd *lov, struct lov_request_set *set, int newea)
+/* alloc objects on osts with round-robin algorithm */
+static int alloc_rr(struct lov_obd *lov, int *idx_arr, int *stripe_cnt)
{
- static int ost_start_idx, ost_start_count;
+ static int ost_start_count, ost_start_idx;
unsigned ost_idx, ost_count = lov->desc.ld_tgt_count;
unsigned ost_active_count = lov->desc.ld_active_tgt_count;
- struct lov_stripe_md *lsm = set->set_md;
- struct obdo *src_oa = set->set_oa;
- int i, rc = 0;
+ int i, *idx_pos = idx_arr;
ENTRY;
-
- LASSERT(src_oa->o_valid & OBD_MD_FLID);
-
- lsm->lsm_object_id = src_oa->o_id;
- if (!lsm->lsm_stripe_size)
- lsm->lsm_stripe_size = lov->desc.ld_default_stripe_size;
- if (!lsm->lsm_pattern) {
- lsm->lsm_pattern = lov->desc.ld_pattern ?
- lov->desc.ld_pattern : LOV_PATTERN_RAID0;
+
+ if (--ost_start_count <= 0) {
+ ost_start_idx = ll_insecure_random_int();
+ ost_start_count =
+ (LOV_CREATE_RESEED_MIN / max(ost_active_count, 1U) +
+ LOV_CREATE_RESEED_MULT) * max(ost_active_count, 1U);
+ } else if (*stripe_cnt >= lov->desc.ld_active_tgt_count) {
+ /* If we allocate from all of the stripes, make the
+ * next file start on the next OST. */
+ ++ost_start_idx;
}
+ ost_idx = ost_start_idx % ost_count;
- if (newea || lsm->lsm_oinfo[0].loi_ost_idx >= ost_count) {
- if (--ost_start_count <= 0) {
- ost_start_idx = ll_insecure_random_int();
- ost_start_count =
- (LOV_CREATE_RESEED_MIN / max(ost_active_count, 1U) +
- LOV_CREATE_RESEED_MULT) * max(ost_active_count, 1U);
- } else if (lsm->lsm_stripe_count >= ost_active_count) {
- /* If we allocate from all of the stripes, make the
- * next file start on the next OST. */
- ++ost_start_idx;
+ for (i = 0; i < ost_count; i++, ost_idx = (ost_idx + 1) % ost_count) {
+ ++ost_start_idx;
+
+ if (lov->tgts[ost_idx].active == 0) {
+ CDEBUG(D_HA, "lov idx %d inactive\n", ost_idx);
+ continue;
}
- ost_idx = ost_start_idx % ost_count;
- } else {
- ost_idx = lsm->lsm_oinfo[0].loi_ost_idx;
+
+ *idx_pos = ost_idx;
+ idx_pos++;
+ /* got enough ost */
+ if (idx_pos - idx_arr == *stripe_cnt)
+ RETURN(0);
}
+ *stripe_cnt = idx_pos - idx_arr;
+ RETURN(0);
+}
- CDEBUG(D_INODE, "allocating %d subobjs for objid "LPX64" at idx %d\n",
- lsm->lsm_stripe_count, lsm->lsm_object_id, ost_idx);
+/* alloc objects on osts with specific stripe offset */
+static int alloc_specific(struct lov_obd *lov, struct lov_stripe_md *lsm,
+ int *idx_arr)
+{
+ unsigned ost_idx, ost_count = lov->desc.ld_tgt_count;
+ int i, *idx_pos = idx_arr;
+ ENTRY;
+ ost_idx = lsm->lsm_oinfo[0].loi_ost_idx;
for (i = 0; i < ost_count; i++, ost_idx = (ost_idx + 1) % ost_count) {
- struct lov_request *req;
-
- ++ost_start_idx;
if (lov->tgts[ost_idx].active == 0) {
CDEBUG(D_HA, "lov idx %d inactive\n", ost_idx);
continue;
}
+ *idx_pos = ost_idx;
+ idx_pos++;
+ /* got enough ost */
+ if (idx_pos - idx_arr == lsm->lsm_stripe_count)
+ RETURN(0);
+ }
+ /* If we were passed specific striping params, then a failure to
+ * meet those requirements is an error, since we can't reallocate
+ * that memory (it might be part of a larger array or something).
+ *
+ * We can only get here if lsm_stripe_count was originally > 1.
+ */
+ CERROR("can't lstripe objid "LPX64": have %u want %u\n",
+ lsm->lsm_object_id, idx_pos - idx_arr, lsm->lsm_stripe_count);
+ RETURN(-EFBIG);
+}
+
+/* Minimum free space, in bytes, an OST must have to be used for object
+ * allocation.  ld_qos_threshold is a 32-bit count of MB, so it is widened
+ * to 64 bits before the shift; otherwise a threshold of 4096 MB or more
+ * would wrap around.  Expects a variable named "lov" in scope at the use
+ * site. */
+#define QOS_MIN ((__u64)lov->desc.ld_qos_threshold << 20)
+
+/* Available bytes and free inodes of a target, read from the obd_osfs of
+ * the obd device behind the target's export. */
+#define TGT_BAVAIL(tgt) ((tgt)->ltd_exp->exp_obd->obd_osfs.os_bavail * \
+                         (tgt)->ltd_exp->exp_obd->obd_osfs.os_bsize)
+#define TGT_FFREE(tgt)  ((tgt)->ltd_exp->exp_obd->obd_osfs.os_ffree)
+
+/* alloc objects on osts with free space weighted algorithm
+ *
+ * Fills idx_arr with up to *stripe_cnt distinct OST indexes, picked at
+ * random with a probability weighted by each OST's available space.
+ * *stripe_cnt is lowered when fewer usable OSTs were found.
+ *
+ * Falls back to alloc_rr() when the scratch arrays cannot be allocated,
+ * there are fewer than two OSTs, the total available space is at or below
+ * QOS_MIN, the sorted list is empty, the first and last list entries are
+ * within about 5% of each other, or no stripe can be placed.
+ *
+ * Returns 0 on success, -ENOSPC when no listed OST has available space, or
+ * the result of alloc_rr() on fallback. */
+static int alloc_qos(struct obd_export *exp, int *idx_arr, int *stripe_cnt)
+{
+        struct lov_obd *lov = &exp->exp_obd->u.lov;
+        unsigned ost_count = lov->desc.ld_tgt_count;
+        __u64 cur_bavail, rand, *availspace, total_bavail = 0;
+        int *indexes, nfound, good_osts, i, warn = 0, rc = 0;
+        struct lov_tgt_desc *tgt;
+        int shift;
+        static time_t last_warn = 0;
+        time_t now = cfs_time_current_sec();
+        ENTRY;
+
+        availspace = NULL;
+        indexes = NULL;
+        /* Both scratch arrays are indexed by good_osts, which may grow up
+         * to the number of OSTs (not just the requested stripe count), so
+         * both must be sized by ost_count. */
+        OBD_ALLOC(availspace, sizeof(__u64) * ost_count);
+        OBD_ALLOC(indexes, sizeof(int) * ost_count);
+        if (!availspace || !indexes)
+                GOTO(out_free, rc = -EAGAIN);
+
+        spin_lock(&lov->lov_lock);
+
+        /* if free space is below some threshold, just go
+         * to do round-robin allocation */
+        total_bavail = (exp->exp_obd->obd_osfs.os_bavail *
+                        exp->exp_obd->obd_osfs.os_bsize);
+        if (ost_count < 2 || total_bavail <= QOS_MIN) {
+                spin_unlock(&lov->lov_lock);
+                GOTO(out_free, rc = -EAGAIN);
+        }
+
+        /* if each ost has almost same free space, go to
+         * do rr allocation for better creation performance */
+        if (!list_empty(&lov->qos_bavail_list)) {
+                __u64 max, min, val;
+                tgt = list_entry(lov->qos_bavail_list.next,
+                                 struct lov_tgt_desc, qos_bavail_list);
+                max = TGT_BAVAIL(tgt);
+                tgt = list_entry(lov->qos_bavail_list.prev,
+                                 struct lov_tgt_desc, qos_bavail_list);
+                min = TGT_BAVAIL(tgt);
+
+                val = (max >= min) ? (max - min) : (min - max);
+                min = (min * 13) >> 8;          /* less than 5% of gap */
+
+                if (val < min) {
+                        spin_unlock(&lov->lov_lock);
+                        GOTO(out_free, rc = -EAGAIN);
+                }
+        } else {
+                spin_unlock(&lov->lov_lock);
+                GOTO(out_free, rc = -EAGAIN);
+        }
+
+        total_bavail = 0;
+        good_osts = 0;
+        /* warn zero available space/inode every 30 min */
+        if (cfs_time_sub(now, last_warn) > 60 * 30)
+                warn = 1;
+        list_for_each_entry(tgt, &lov->qos_bavail_list, qos_bavail_list) {
+                /* never write past the scratch arrays */
+                if (good_osts >= (int)ost_count)
+                        break;
+                if (!tgt->active)
+                        continue;
+                if (!TGT_BAVAIL(tgt)) {
+                        if (warn) {
+                                CWARN("avail space on %s is zero\n",
+                                      tgt->uuid.uuid);
+                                last_warn = now;
+                        }
+                        continue;
+                }
+                if (!TGT_FFREE(tgt)) {
+                        if (warn) {
+                                CWARN("free inode on %s is zero\n",
+                                      tgt->uuid.uuid);
+                                last_warn = now;
+                        }
+                        continue;
+                }
+                /* nearly-full OSTs are only used to reach the stripe count */
+                if ((TGT_BAVAIL(tgt) <= QOS_MIN) && (good_osts >= *stripe_cnt))
+                        break;
+                availspace[good_osts] = TGT_BAVAIL(tgt);
+                indexes[good_osts] = tgt->index;
+                total_bavail += availspace[good_osts];
+                good_osts++;
+        }
+
+        spin_unlock(&lov->lov_lock);
+
+        if (!total_bavail)
+                GOTO(out_free, rc = -ENOSPC);
+
+        /* if we don't have enough good OSTs, we reduce the stripe count. */
+        if (good_osts < *stripe_cnt)
+                *stripe_cnt = good_osts;
+
+        if (!*stripe_cnt)
+                GOTO(out_free, rc = -EAGAIN);
+
+        /* shift = bit length of total_bavail plus one; the first loop is
+         * bounded so a 64-bit value is never shifted by its full width. */
+        nfound = shift = 0;
+        while (shift < 64 && (total_bavail >> shift) > 0)
+                shift++;
+        shift++;
+        /* search enough OSTs with free space weighted random allocation */
+        while (nfound < *stripe_cnt) {
+                cur_bavail = 0;
+
+                get_random_bytes(&rand, sizeof(rand));
+                /* the mask must be built in 64 bits: shift is usually > 31 */
+                if (shift < 64)
+                        rand &= (((__u64)1 << shift) - 1);
+                while (rand > total_bavail)
+                        rand -= total_bavail;
+
+                for (i = 0; i < good_osts; i++) {
+                        /* slot already handed out in an earlier round; a
+                         * zero rand must not select it a second time */
+                        if (availspace[i] == 0)
+                                continue;
+                        cur_bavail += availspace[i];
+                        if (cur_bavail >= rand) {
+                                total_bavail -= availspace[i];
+                                availspace[i] = 0;
+                                idx_arr[nfound] = indexes[i];
+                                nfound++;
+                                break;
+                        }
+                }
+                /* should never satisfy below condition */
+                if (cur_bavail == 0)
+                        break;
+        }
+        LASSERT(nfound == *stripe_cnt);
+
+out_free:
+        if (availspace)
+                OBD_FREE(availspace, sizeof(__u64) * ost_count);
+        if (indexes)
+                OBD_FREE(indexes, sizeof(int) * ost_count);
+        if (rc != -EAGAIN)
+                RETURN(rc);
+
+        /* -EAGAIN is internal only: it means "use round-robin instead" */
+        rc = alloc_rr(lov, idx_arr, stripe_cnt);
+        RETURN(rc);
+}
+/* Allocate and fill the array of OST indexes to create objects on.
+ *
+ * For a new EA, or when the first stripe's OST index is out of range, the
+ * indexes come from alloc_qos(); otherwise from alloc_specific().
+ *
+ * On success returns the number of stripes actually chosen, stores the
+ * array in *idx_arr and its allocated length in *arr_cnt (release with
+ * free_idx_array()).  On failure returns a negative errno and leaves
+ * *idx_arr == NULL and *arr_cnt == 0, so the caller may still pass both to
+ * free_idx_array() unconditionally. */
+static int alloc_idx_array(struct obd_export *exp, struct lov_stripe_md *lsm,
+                           int newea, int **idx_arr, int *arr_cnt)
+{
+        struct lov_obd *lov = &exp->exp_obd->u.lov;
+        int stripe_cnt = lsm->lsm_stripe_count;
+        int i, rc = 0;
+        int *tmp_arr = NULL;
+        ENTRY;
+
+        /* Nothing is owned by the caller until we succeed. */
+        *idx_arr = NULL;
+        *arr_cnt = 0;
+
+        OBD_ALLOC(tmp_arr, stripe_cnt * sizeof(int));
+        if (tmp_arr == NULL)
+                RETURN(-ENOMEM);
+        /* remember the allocated length: stripe_cnt may shrink below */
+        *arr_cnt = stripe_cnt;
+        for (i = 0; i < *arr_cnt; i++)
+                tmp_arr[i] = -1;
+
+        if (newea ||
+            lsm->lsm_oinfo[0].loi_ost_idx >= lov->desc.ld_tgt_count)
+                rc = alloc_qos(exp, tmp_arr, &stripe_cnt);
+        else
+                rc = alloc_specific(lov, lsm, tmp_arr);
+
+        if (rc)
+                GOTO(out_arr, rc);
+
+        *idx_arr = tmp_arr;
+        RETURN(stripe_cnt);
+out_arr:
+        OBD_FREE(tmp_arr, *arr_cnt * sizeof(int));
+        *arr_cnt = 0;
+        RETURN(rc);
+}
+
+/* Release an index array of arr_cnt ints; a zero arr_cnt means there is
+ * nothing to free and idx_arr is not touched. */
+static void free_idx_array(int *idx_arr, int arr_cnt)
+{
+        if (!arr_cnt)
+                return;
+
+        OBD_FREE(idx_arr, arr_cnt * sizeof(int));
+}
+
+int qos_prep_create(struct obd_export *exp, struct lov_request_set *set)
+{
+ struct lov_obd *lov = &exp->exp_obd->u.lov;
+ struct lov_stripe_md *lsm;
+ struct obdo *src_oa = set->set_oa;
+ struct obd_trans_info *oti = set->set_oti;
+ int i, stripes, rc = 0, newea = 0;
+ int *idx_arr, idx_cnt = 0;
+ ENTRY;
+
+ LASSERT(src_oa->o_valid & OBD_MD_FLID);
+
+ if (set->set_md == NULL) {
+ int stripe_cnt = lov_get_stripecnt(lov, 0);
+
+ /* If the MDS file was truncated up to some size, stripe over
+ * enough OSTs to allow the file to be created at that size. */
+ if (src_oa->o_valid & OBD_MD_FLSIZE) {
+ struct lov_tgt_desc *tgt;
+ stripes = 1;
+
+ spin_lock(&lov->lov_lock);
+ list_for_each_entry(tgt, &lov->qos_bavail_list,
+ qos_bavail_list) {
+ if (!tgt->active)
+ continue;
+ if (TGT_BAVAIL(tgt) * stripes > src_oa->o_size)
+ break;
+ stripes++;
+ }
+ spin_unlock(&lov->lov_lock);
+
+ if (stripes < stripe_cnt)
+ stripes = stripe_cnt;
+ } else {
+ stripes = stripe_cnt;
+ }
+
+ rc = lov_alloc_memmd(&set->set_md, stripes,
+ lov->desc.ld_pattern ?
+ lov->desc.ld_pattern : LOV_PATTERN_RAID0);
+ if (rc < 0)
+ GOTO(out_err, rc);
+ rc = 0;
+ newea = 1;
+ }
+ lsm = set->set_md;
+
+ lsm->lsm_object_id = src_oa->o_id;
+ if (!lsm->lsm_stripe_size)
+ lsm->lsm_stripe_size = lov->desc.ld_default_stripe_size;
+ if (!lsm->lsm_pattern) {
+ LASSERT(lov->desc.ld_pattern);
+ lsm->lsm_pattern = lov->desc.ld_pattern;
+ }
+
+ stripes = alloc_idx_array(exp, lsm, newea, &idx_arr, &idx_cnt);
+ LASSERT(stripes <= lsm->lsm_stripe_count);
+ if (stripes <= 0)
+ GOTO(out_err, rc = stripes ? stripes : -EIO);
+
+ for (i = 0; i < stripes; i++) {
+ struct lov_request *req;
+ int ost_idx = idx_arr[i];
+ LASSERT(ost_idx >= 0);
+
OBD_ALLOC(req, sizeof(*req));
if (req == NULL)
- GOTO(out, rc = -ENOMEM);
+ GOTO(out_err, rc = -ENOMEM);
+ lov_set_add_req(req, set);
req->rq_buflen = sizeof(*req->rq_md);
OBD_ALLOC(req->rq_md, req->rq_buflen);
if (req->rq_md == NULL)
- GOTO(out, rc = -ENOMEM);
-
+ GOTO(out_err, rc = -ENOMEM);
+
req->rq_oa = obdo_alloc();
if (req->rq_oa == NULL)
- GOTO(out, rc = -ENOMEM);
-
+ GOTO(out_err, rc = -ENOMEM);
+
req->rq_idx = ost_idx;
req->rq_stripe = i;
/* create data objects with "parent" OA */
* stripe which holds the existing file size.
*/
if (src_oa->o_valid & OBD_MD_FLSIZE) {
- if (lov_stripe_offset(lsm, src_oa->o_size, i,
- &req->rq_oa->o_size) < 0 &&
- req->rq_oa->o_size)
- req->rq_oa->o_size--;
+ req->rq_oa->o_size =
+ lov_size_to_stripe(lsm, src_oa->o_size, i);
CDEBUG(D_INODE, "stripe %d has size "LPU64"/"LPU64"\n",
i, req->rq_oa->o_size, src_oa->o_size);
}
- lov_set_add_req(req, set);
-
- /* If we have allocated enough objects, we are OK */
- if (set->set_count == lsm->lsm_stripe_count)
- GOTO(out, rc = 0);
}
+ LASSERT(set->set_count == stripes);
- if (set->set_count == 0)
- GOTO(out, rc = -EIO);
-
- /* If we were passed specific striping params, then a failure to
- * meet those requirements is an error, since we can't reallocate
- * that memory (it might be part of a larger array or something).
- *
- * We can only get here if lsm_stripe_count was originally > 1.
- */
- if (!newea) {
- CERROR("can't lstripe objid "LPX64": have %u want %u, rc %d\n",
- lsm->lsm_object_id, set->set_count,
- lsm->lsm_stripe_count, rc);
- rc = rc ? rc : -EFBIG;
- } else {
+ if (stripes < lsm->lsm_stripe_count)
qos_shrink_lsm(set);
- rc = 0;
+
+ if (oti && (src_oa->o_valid & OBD_MD_FLCOOKIE)) {
+ oti_alloc_cookies(oti, set->set_count);
+ if (!oti->oti_logcookies)
+ GOTO(out_err, rc = -ENOMEM);
+ set->set_cookies = oti->oti_logcookies;
}
-out:
- RETURN(rc);
+out_err:
+ if (newea && rc)
+ obd_free_memmd(exp, &set->set_md);
+ free_idx_array(idx_arr, idx_cnt);
+ EXIT;
+ return rc;
}
+
+/* Re-position (tgt) inside the list headed by (lov)->list_name so that the
+ * list stays sorted by value() in descending order from the head.  A target
+ * that is not yet linked is first added at the head.
+ *
+ * A caveat here: don't use list_move() on the same list, hence the explicit
+ * list_del_init() + list_add() pairs. */
+#define list_adjust(tgt, lov, list_name, value)                         \
+do {                                                                    \
+        struct list_head *element;                                      \
+        struct lov_tgt_desc *tmp;                                       \
+        if (list_empty(&(tgt)->list_name))                              \
+                list_add(&(tgt)->list_name, &(lov)->list_name);         \
+        /* value dropped: walk towards the tail past larger entries */  \
+        element = (tgt)->list_name.next;                                \
+        while((element != &(lov)->list_name) &&                         \
+              (tmp = list_entry(element, struct lov_tgt_desc, list_name)) && \
+              (value(tgt) < value(tmp)))                                \
+                element = element->next;                                \
+        if (element != (tgt)->list_name.next) {                         \
+                /* re-link just in front of the stopping point */       \
+                list_del_init(&(tgt)->list_name);                       \
+                list_add(&(tgt)->list_name, element->prev);             \
+        }                                                               \
+        /* value grew: walk towards the head past smaller entries */    \
+        element = (tgt)->list_name.prev;                                \
+        while ((element != &(lov)->list_name) &&                        \
+              (tmp = list_entry(element, struct lov_tgt_desc, list_name)) && \
+              (value(tgt) > value(tmp)))                                \
+                element = element->prev;                                \
+        if (element != (tgt)->list_name.prev) {                         \
+                /* re-link directly behind the stopping point (the    */ \
+                /* first entry that is not smaller, or the list head) */ \
+                list_del_init(&(tgt)->list_name);                       \
+                list_add(&(tgt)->list_name, element);                   \
+        }                                                               \
+} while (0)
+
+/* Re-sort target number idx within lov->qos_bavail_list, under lov_lock.
+ *
+ * osfs is used only for the zero-space warning and the debug message.
+ * NOTE(review): the sort key is TGT_BAVAIL(tgt), not the osfs passed in;
+ * confirm the caller has refreshed the target's stored statfs data before
+ * calling. */
+void qos_update(struct lov_obd *lov, int idx, struct obd_statfs *osfs)
+{
+        struct lov_tgt_desc *tgt = &lov->tgts[idx];
+        __u64 bavail;
+        ENTRY;
+
+        bavail = osfs->os_bavail * osfs->os_bsize;
+        if (!bavail)
+                CWARN("ost %d has zero avail space!\n", idx);
+
+        CDEBUG(D_OTHER, "QOS: bfree now "LPU64"\n", bavail);
+
+        spin_lock(&lov->lov_lock);
+        list_adjust(tgt, lov, qos_bavail_list, TGT_BAVAIL);
+        spin_unlock(&lov->lov_lock);
+
+        /* balance the ENTRY above in the debug trace */
+        EXIT;
+}
+
LASSERT(set->set_exp);
if (set == NULL)
RETURN(0);
- if (set->set_completes) {
+ if (set->set_completes)
rc = create_done(set->set_exp, set, lsmp);
- /* FIXME update qos data here */
- }
if (atomic_dec_and_test(&set->set_refcount))
lov_finish_set(set);
struct obdo *src_oa, struct obd_trans_info *oti,
struct lov_request_set **reqset)
{
- struct lov_obd *lov = &exp->exp_obd->u.lov;
struct lov_request_set *set;
- int rc = 0, newea = 0;
+ int rc = 0;
ENTRY;
OBD_ALLOC(set, sizeof(*set));
set->set_md = *lsmp;
set->set_oa = src_oa;
set->set_oti = oti;
-
- if (set->set_md == NULL) {
- int stripes, stripe_cnt;
- stripe_cnt = lov_get_stripecnt(lov, 0);
-
- /* If the MDS file was truncated up to some size, stripe over
- * enough OSTs to allow the file to be created at that size. */
- if (src_oa->o_valid & OBD_MD_FLSIZE) {
- stripes=((src_oa->o_size+LUSTRE_STRIPE_MAXBYTES)>>12)-1;
- do_div(stripes, (__u32)(LUSTRE_STRIPE_MAXBYTES >> 12));
-
- if (stripes > lov->desc.ld_active_tgt_count)
- GOTO(out_set, rc = -EFBIG);
- if (stripes < stripe_cnt)
- stripes = stripe_cnt;
- } else {
- stripes = stripe_cnt;
- }
-
- rc = lov_alloc_memmd(&set->set_md, stripes,
- lov->desc.ld_pattern ?
- lov->desc.ld_pattern : LOV_PATTERN_RAID0);
- if (rc < 0)
- goto out_set;
- newea = 1;
- }
-
- rc = qos_prep_create(lov, set, newea);
+
+ rc = qos_prep_create(exp, set);
if (rc)
- goto out_lsm;
-
- if (oti && (src_oa->o_valid & OBD_MD_FLCOOKIE)) {
- oti_alloc_cookies(oti, set->set_count);
- if (!oti->oti_logcookies)
- goto out_lsm;
- set->set_cookies = oti->oti_logcookies;
- }
- *reqset = set;
- RETURN(rc);
-
-out_lsm:
- if (*lsmp == NULL)
- obd_free_memmd(exp, &set->set_md);
-out_set:
- lov_fini_create_set(set, lsmp);
+ lov_fini_create_set(set, lsmp);
+ else
+ *reqset = set;
RETURN(rc);
-}
+}
static int common_attr_done(struct lov_request_set *set)
{
return snprintf(page, count, "%s\n", lov->desc.ld_uuid.uuid);
}
+/* proc read handler: print the LOV's qos threshold as "<n> MB". */
+static int lov_rd_qos_threshold(char *page, char **start, off_t off, int count,
+                                int *eof, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+
+        LASSERT(obd != NULL);
+        *eof = 1;
+        return snprintf(page, count, "%u MB\n",
+                        obd->u.lov.desc.ld_qos_threshold);
+}
+
+/* proc write handler: parse a positive integer and store it as the LOV's
+ * qos threshold.  Returns count on success, -EINVAL for values <= 0, or
+ * the error from lprocfs_write_helper(). */
+static int lov_wr_qos_threshold(struct file *file, const char *buffer,
+                                unsigned long count, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+        int new_val, rc;
+
+        LASSERT(obd != NULL);
+
+        rc = lprocfs_write_helper(buffer, count, &new_val);
+        if (rc != 0)
+                return rc;
+        if (new_val < 1)
+                return -EINVAL;
+
+        obd->u.lov.desc.ld_qos_threshold = new_val;
+        return count;
+}
+
+/* proc read handler: print the LOV's qos maxage as "<n> Sec". */
+static int lov_rd_qos_maxage(char *page, char **start, off_t off, int count,
+                             int *eof, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+
+        LASSERT(obd != NULL);
+        *eof = 1;
+        return snprintf(page, count, "%u Sec\n",
+                        obd->u.lov.desc.ld_qos_maxage);
+}
+
+/* proc write handler: parse a positive integer and store it as the LOV's
+ * qos maxage.  Returns count on success, -EINVAL for values <= 0, or the
+ * error from lprocfs_write_helper(). */
+static int lov_wr_qos_maxage(struct file *file, const char *buffer,
+                             unsigned long count, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+        int new_val, rc;
+
+        LASSERT(obd != NULL);
+
+        rc = lprocfs_write_helper(buffer, count, &new_val);
+        if (rc != 0)
+                return rc;
+        if (new_val < 1)
+                return -EINVAL;
+
+        obd->u.lov.desc.ld_qos_maxage = new_val;
+        return count;
+}
+
static void *lov_tgt_seq_start(struct seq_file *p, loff_t *pos)
{
struct obd_device *dev = p->private;
{ "kbytesfree", lprocfs_rd_kbytesfree, 0, 0 },
{ "kbytesavail", lprocfs_rd_kbytesavail, 0, 0 },
{ "desc_uuid", lov_rd_desc_uuid, 0, 0 },
+ { "qos_threshold",lov_rd_qos_threshold, lov_wr_qos_threshold, 0 },
+ { "qos_maxage", lov_rd_qos_maxage, lov_wr_qos_maxage, 0 },
{ 0 }
};
__swab64s (&ld->ld_default_stripe_size);
__swab64s (&ld->ld_default_stripe_offset);
__swab32s (&ld->ld_pattern);
+ __swab32s (&ld->ld_qos_threshold);
+ __swab32s (&ld->ld_qos_maxage);
/* uuid endian insensitive */
}
(long long)(int)offsetof(struct mds_body, aclsize));
LASSERTF((int)sizeof(((struct mds_body *)0)->aclsize) == 4, " found %lld\n",
(long long)(int)sizeof(((struct mds_body *)0)->aclsize));
+ LASSERTF((int)offsetof(struct mds_body, padding_2) == 156, " found %lld\n",
+ (long long)(int)offsetof(struct mds_body, padding_2));
+ LASSERTF((int)sizeof(((struct mds_body *)0)->padding_2) == 4, " found %lld\n",
+ (long long)(int)sizeof(((struct mds_body *)0)->padding_2));
+ LASSERTF((int)offsetof(struct mds_body, padding_3) == 160, " found %lld\n",
+ (long long)(int)offsetof(struct mds_body, padding_3));
+ LASSERTF((int)sizeof(((struct mds_body *)0)->padding_3) == 4, " found %lld\n",
+ (long long)(int)sizeof(((struct mds_body *)0)->padding_3));
+ LASSERTF((int)offsetof(struct mds_body, padding_4) == 164, " found %lld\n",
+ (long long)(int)offsetof(struct mds_body, padding_4));
+ LASSERTF((int)sizeof(((struct mds_body *)0)->padding_4) == 4, " found %lld\n",
+ (long long)(int)sizeof(((struct mds_body *)0)->padding_4));
LASSERTF(FMODE_READ == 1, " found %lld\n",
(long long)FMODE_READ);
LASSERTF(FMODE_WRITE == 2, " found %lld\n",
(long long)(int)offsetof(struct lov_desc, ld_default_stripe_offset));
LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset) == 8, " found %lld\n",
(long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset));
+ LASSERTF((int)offsetof(struct lov_desc, ld_qos_threshold) == 32, " found %lld\n",
+ (long long)(int)offsetof(struct lov_desc, ld_qos_threshold));
+ LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_qos_threshold) == 4, " found %lld\n",
+ (long long)(int)sizeof(((struct lov_desc *)0)->ld_qos_threshold));
+ LASSERTF((int)offsetof(struct lov_desc, ld_qos_maxage) == 36, " found %lld\n",
+ (long long)(int)offsetof(struct lov_desc, ld_qos_maxage));
+ LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_qos_maxage) == 4, " found %lld\n",
+ (long long)(int)sizeof(((struct lov_desc *)0)->ld_qos_maxage));
LASSERTF((int)offsetof(struct lov_desc, ld_uuid) == 48, " found %lld\n",
(long long)(int)offsetof(struct lov_desc, ld_uuid));
LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_uuid) == 40, " found %lld\n",
--- /dev/null
+#!/bin/bash
+
+set -e
+
+export PATH=`dirname $0`/../utils:$PATH
+
+LFS=${LFS:-lfs}
+LCTL=${LCTL:-lctl}
+MOUNT=${MOUNT:-/mnt/lustre}
+MAXAGE=${MAXAGE:-1}
+
+QOSFILE=$MOUNT/qos_file
+TAB='--'
+
+echo "remove all files on $MOUNT..."
+rm -fr $MOUNT/*
+sleep 1 # to ensure we get up-to-date statfs info
+
+# set_qos <threshold_kb> <maxage>
+# Write the threshold (converted from KB to MB) and the maxage into the
+# qos_threshold / qos_maxage proc file of every LOV.
+set_qos() {
+	for i in $(ls /proc/fs/lustre/lov/*/qos_threshold)
+	do
+		echo $(($1/1024)) > $i
+	done
+	for i in $(ls /proc/fs/lustre/lov/*/qos_maxage)
+	do
+		echo $2 > $i
+	done
+}
+
+# Probe the filesystem; the per-OST figure assumes all OSTs have the same
+# free space.
+OSTCOUNT=$(cat /proc/fs/lustre/lov/*/activeobd | head -n 1)
+TOTALAVAIL=$(cat /proc/fs/lustre/llite/*/kbytesavail | head -n 1)
+SINGLEAVAIL=$(($TOTALAVAIL/$OSTCOUNT))
+MINFREE=$((1024 * 4)) # 4M
+TOTALFFREE=$(cat /proc/fs/lustre/llite/*/filesfree | head -n 1)
+
+# Too little space per OST is an error; too few OSTs only skips the test.
+if [ $SINGLEAVAIL -lt $MINFREE ]
+then
+	echo "ERROR: single ost free size($SINGLEAVAIL kb) is too low!"
+	exit 1
+fi
+if [ $OSTCOUNT -lt 3 ]
+then
+	echo "WARN: ost count($OSTCOUNT) must be greater than 2!"
+	exit 0
+fi
+
+# Test 1: fill one OST until its free space is below the threshold, then
+# check that ten new single-stripe files all avoid that OST.
+# Returns 0 on success, 1 on failure.
+qos_test_1() {
+	echo "[qos test 1]: creation skip almost full OST (avail space < threshold)"
+
+	# threshold = half of one OST's size
+	THRESHOLD=$(($SINGLEAVAIL/2))
+	set_qos $THRESHOLD $MAXAGE
+
+	# single-stripe file; whichever OST it lands on becomes the full one
+	$LFS setstripe $QOSFILE 65536 -1 1
+	FULLOST=$($LFS find -q $QOSFILE | awk '/\s*\d*/ {print $1}')
+
+	# floodfill the FULLOST
+	echo "$TAB fill the OST $FULLOST to almost fullness..."
+	dd if=/dev/zero of=$QOSFILE count=$(($SINGLEAVAIL - $THRESHOLD + 1500)) bs=1k > /dev/null 2>&1 || return 1
+	echo "$TAB done"
+
+	# wait so the statfs data is older than maxage
+	sleep $(($MAXAGE * 2))
+	echo "$TAB create 10 files with 1 stripe"
+	for i in $(seq 10)
+	do
+		rm -f $MOUNT/file-$i
+		$LFS setstripe $MOUNT/file-$i 65536 -1 1
+		idx=$($LFS find -q $MOUNT/file-$i | awk '/\s*\d*/ {print $1}')
+		if [ $idx -eq $FULLOST ]
+		then
+			echo "$TAB ERROR: create object on full OST $FULLOST"
+			return 1
+		fi
+	done
+	echo "$TAB no object created on OST $FULLOST"
+
+	# cleanup
+	for i in $(seq 10)
+	do
+		rm -f $MOUNT/file-$i
+	done
+	rm -f $QOSFILE
+	# put threshold and maxage back to normal values
+	set_qos 10240 1
+
+	sleep 1
+	return 0
+}
+
+# Test 2: fill OST 0 to 3/4, create many 2-stripe files, then check that
+# every other OST received at least twice as many objects as OST 0.
+# Returns 0 on success (or when skipped), 1 on failure.
+qos_test_2 () {
+	echo "[qos test 2]: creation balancing over all OSTs by free space"
+
+	if [ $OSTCOUNT -lt 3 ]; then
+		echo "$TAB WARN: OST count < 3, test skipped"
+		return 0
+	fi
+
+	WADSZ=$(($SINGLEAVAIL * 3 / 4))
+
+	# fill OST 0 to 3/4 fullness
+	$LFS setstripe $QOSFILE 65536 0 1
+	echo "$TAB fill the OST 0 to 3/4 fulness..."
+	dd if=/dev/zero of=$QOSFILE count=$WADSZ bs=1k > /dev/null 2>&1 || return 1
+	echo "$TAB done"
+
+	# write 2 stripe files to fill up other OSTs
+	LOOPCNT=500
+	echo "$TAB create $LOOPCNT files with 2 stripe..."
+	for i in `seq $LOOPCNT`; do
+		rm -f $MOUNT/file-$i
+		$LFS setstripe $MOUNT/file-$i 65536 -1 2
+	done
+	echo "$TAB done"
+
+	# the objects created on OST 0 should be 1/4 of on other OSTs'
+	# Count under $MOUNT (not a hard-coded path) and match the index as a
+	# whole line, so that e.g. 10 or 20 are not counted as OST 0.  grep -c
+	# exits non-zero on a zero count, which must not trip "set -e".
+	CNT0=`$LFS find -q $MOUNT | awk '/\s*\d*/ {print $1}' | grep -cx 0` || true
+	# do not count the object of $QOSFILE itself
+	CNT0=$(($CNT0 - 1))
+	echo "$TAB object created on OST 0: $CNT0"
+
+	# the object count of other osts must be greater than 2 times
+	CNT0=$(($CNT0 * 2))
+	for i in `seq $(($OSTCOUNT - 1))`; do
+		CNT=`$LFS find -q $MOUNT | awk '/\s*\d*/ {print $1}' | grep -cx $i` || true
+		echo "$TAB object created on OST $i: $CNT"
+		if [ $CNT0 -gt $CNT ] ; then
+			echo "$TAB ERROR: too much objects created on OST 0"
+			return 1
+		fi
+	done
+	echo "$TAB objects created on OST 0 is about 1/4 of others'"
+
+	# cleanup
+	for i in `seq $LOOPCNT`; do
+		rm -f $MOUNT/file-$i
+	done
+	rm -f $QOSFILE
+	return 0
+}
+
+
+# Run qos_test_1 and qos_test_2 in order; stop with status 1 on the first
+# failure.  The test is called as a plain command so "set -e" stays in
+# effect inside it.
+for j in $(seq 2)
+do
+	qos_test_$j
+	[ $? -eq 0 ] || exit 1
+done
+exit 0
ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"42a 42c 45 68"}
# UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
-[ "$SLOW" = "no" ] && EXCEPT="$EXCEPT 24o 51b 51c 64b 71 101"
+[ "$SLOW" = "no" ] && EXCEPT="$EXCEPT 24o 51b 51c 64b 71 75 101"
case `uname -r` in
2.4*) FSTYPE=${FSTYPE:-ext3} ;;
}
run_test 74 "ldlm_enqueue freed-export error path (shouldn't LBUG)"
+# Run the QOS allocation tests by invoking qos.sh with sh.
+# NOTE(review): qos.sh is looked up relative to the current directory --
+# confirm the suite is always started from the tests directory.
+test_75() {
+ sh qos.sh
+}
+run_test 75 "qos test ============================================"
+
# on the LLNL clusters, runas will still pick up root's $TMP settings,
# which will not be writable for the runas user, and then you get a CVS
# error message with a corrupt path string (CVS bug) and panic.
"usage: add_conn <conn_uuid> [priority]\n"},
{"del_conn ", jt_lcfg_del_conn, 0,
"usage: del_conn <conn_uuid> \n"},
-
+
/* Llog operations */
{"llog_catlist", jt_llog_catlist, 0,
"list all catalog logs on current device.\n"
#include <linux/lustre_idl.h>
#include <linux/lustre_dlm.h>
#include <linux/obd.h> /* for struct lov_stripe_md */
+#include <linux/obd_lov.h>
#include <linux/lustre_build_version.h>
#include <unistd.h>
jt_cmdname(argv[0]), argv[5]);
return CMD_HELP;
}
+ desc.ld_qos_threshold = QOS_DEFAULT_THRESHOLD;
+ desc.ld_qos_maxage = QOS_DEFAULT_MAXAGE;
if (argc == 7) {
desc.ld_tgt_count = strtoul(argv[6], &end, 0);
CHECK_MEMBER(lov_desc, ld_pattern);
CHECK_MEMBER(lov_desc, ld_default_stripe_size);
CHECK_MEMBER(lov_desc, ld_default_stripe_offset);
+ CHECK_MEMBER(lov_desc, ld_qos_threshold);
+ CHECK_MEMBER(lov_desc, ld_qos_maxage);
CHECK_MEMBER(lov_desc, ld_uuid);
}
void lustre_assert_wire_constants(void)
{
/* Wire protocol assertions generated by 'wirecheck'
- * running on Linux mustang 2.6.12-1.1456_FC4smp #1 SMP Thu Sep 22 02:22:14 EDT 2005 i686 i68
- * with gcc version 4.0.1 20050727 (Red Hat 4.0.1-5) */
+ * running on Linux localhost.localdomain 2.6.9-1.667 #1 Tue Nov 2 14:41:25 EST 2004 i686 i68
+ * with gcc version 3.4.3 20050227 (Red Hat 3.4.3-22.fc3) */
/* Constants... */
(long long)(int)offsetof(struct mds_body, aclsize));
LASSERTF((int)sizeof(((struct mds_body *)0)->aclsize) == 4, " found %lld\n",
(long long)(int)sizeof(((struct mds_body *)0)->aclsize));
+ LASSERTF((int)offsetof(struct mds_body, padding_2) == 156, " found %lld\n",
+ (long long)(int)offsetof(struct mds_body, padding_2));
+ LASSERTF((int)sizeof(((struct mds_body *)0)->padding_2) == 4, " found %lld\n",
+ (long long)(int)sizeof(((struct mds_body *)0)->padding_2));
+ LASSERTF((int)offsetof(struct mds_body, padding_3) == 160, " found %lld\n",
+ (long long)(int)offsetof(struct mds_body, padding_3));
+ LASSERTF((int)sizeof(((struct mds_body *)0)->padding_3) == 4, " found %lld\n",
+ (long long)(int)sizeof(((struct mds_body *)0)->padding_3));
+ LASSERTF((int)offsetof(struct mds_body, padding_4) == 164, " found %lld\n",
+ (long long)(int)offsetof(struct mds_body, padding_4));
+ LASSERTF((int)sizeof(((struct mds_body *)0)->padding_4) == 4, " found %lld\n",
+ (long long)(int)sizeof(((struct mds_body *)0)->padding_4));
LASSERTF(FMODE_READ == 1, " found %lld\n",
(long long)FMODE_READ);
LASSERTF(FMODE_WRITE == 2, " found %lld\n",
(long long)(int)offsetof(struct lov_desc, ld_default_stripe_offset));
LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset) == 8, " found %lld\n",
(long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset));
+ LASSERTF((int)offsetof(struct lov_desc, ld_qos_threshold) == 32, " found %lld\n",
+ (long long)(int)offsetof(struct lov_desc, ld_qos_threshold));
+ LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_qos_threshold) == 4, " found %lld\n",
+ (long long)(int)sizeof(((struct lov_desc *)0)->ld_qos_threshold));
+ LASSERTF((int)offsetof(struct lov_desc, ld_qos_maxage) == 36, " found %lld\n",
+ (long long)(int)offsetof(struct lov_desc, ld_qos_maxage));
+ LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_qos_maxage) == 4, " found %lld\n",
+ (long long)(int)sizeof(((struct lov_desc *)0)->ld_qos_maxage));
LASSERTF((int)offsetof(struct lov_desc, ld_uuid) == 48, " found %lld\n",
(long long)(int)offsetof(struct lov_desc, ld_uuid));
LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_uuid) == 40, " found %lld\n",