From c19aed1ebdc557fee8d8d8d03991f0f099397bed Mon Sep 17 00:00:00 2001 From: ericm Date: Wed, 1 Mar 2006 23:10:56 +0000 Subject: [PATCH] branch: b1_5 qos which previously landed on b1_5 was backed out during last merge from b1_4, found by Andreas. now put them back. --- lustre/include/lustre_idl.h | 4 +- lustre/include/lustre_lib.h | 7 + lustre/include/obd.h | 4 + lustre/lov/lov_internal.h | 4 +- lustre/lov/lov_obd.c | 32 ++- lustre/lov/lov_qos.c | 458 +++++++++++++++++++++++++++++++++++-------- lustre/lov/lov_request.c | 55 +----- lustre/lov/lproc_lov.c | 64 ++++++ lustre/ptlrpc/pack_generic.c | 30 ++- lustre/tests/sanity.sh | 7 +- lustre/utils/lustre_cfg.c | 2 + lustre/utils/wirecheck.c | 4 +- lustre/utils/wiretest.c | 28 ++- 13 files changed, 525 insertions(+), 174 deletions(-) diff --git a/lustre/include/lustre_idl.h b/lustre/include/lustre_idl.h index 2f85102..c1732f8 100644 --- a/lustre/include/lustre_idl.h +++ b/lustre/include/lustre_idl.h @@ -832,10 +832,10 @@ struct lov_desc { __u32 ld_pattern; /* PATTERN_RAID0, PATTERN_RAID1 */ __u64 ld_default_stripe_size; /* in bytes */ __u64 ld_default_stripe_offset; /* in bytes */ + __u32 ld_qos_threshold; /* in MB */ + __u32 ld_qos_maxage; /* in second */ __u32 ld_padding_1; /* also fix lustre_swab_lov_desc */ __u32 ld_padding_2; /* also fix lustre_swab_lov_desc */ - __u32 ld_padding_3; /* also fix lustre_swab_lov_desc */ - __u32 ld_padding_4; /* also fix lustre_swab_lov_desc */ struct obd_uuid ld_uuid; }; diff --git a/lustre/include/lustre_lib.h b/lustre/include/lustre_lib.h index 13ceaef..90a90d3 100644 --- a/lustre/include/lustre_lib.h +++ b/lustre/include/lustre_lib.h @@ -479,6 +479,13 @@ static inline void obd_ioctl_freedata(char *buf, int len) #define POISON_BULK 0 +static inline int ll_insecure_random_int(void) +{ + struct timeval t; + do_gettimeofday(&t); + return (int)(t.tv_usec); +} + /* * l_wait_event is a flexible sleeping function, permitting simple caller * configuration of interrupt and timeout sensitivity along with actions to diff --git a/lustre/include/obd.h b/lustre/include/obd.h index e66d174..425a657 100644 --- a/lustre/include/obd.h +++ b/lustre/include/obd.h @@ -453,6 +453,8 @@ struct lov_tgt_desc { struct obd_export *ltd_exp; unsigned int active:1, /* is this target up for requests */ reap:1; /* should this target be deleted */ + int index; /* index of target array in lov_obd */ + struct list_head qos_bavail_list; /* link entry to lov_obd */ }; struct lov_obd { @@ -464,6 +466,8 @@ struct lov_obd { int death_row; /* Do we have tgts scheduled to be deleted? (Make this a linked list?) */ unsigned int lo_catalog_loaded:1; + struct list_head qos_bavail_list; /* tgts list, sorted by available + space, protected by lov_lock */ struct lov_tgt_desc *tgts; }; diff --git a/lustre/lov/lov_internal.h b/lustre/lov/lov_internal.h index 2af645e..5829fa9 100644 --- a/lustre/lov/lov_internal.h +++ b/lustre/lov/lov_internal.h @@ -132,8 +132,8 @@ int lov_stripe_number(struct lov_stripe_md *lsm, obd_off lov_off); /* lov_qos.c */ void qos_shrink_lsm(struct lov_request_set *set); -int qos_prep_create(struct lov_obd *lov, struct lov_request_set *set, - int newea); +int qos_prep_create(struct obd_export *exp, struct lov_request_set *set); +void qos_update(struct lov_obd *lov, int idx, struct obd_statfs *osfs); int qos_remedy_create(struct lov_request_set *set, struct lov_request *req); /* lov_request.c */ diff --git a/lustre/lov/lov_obd.c b/lustre/lov/lov_obd.c index d0cec17..58f6b27 100644 --- a/lustre/lov/lov_obd.c +++ b/lustre/lov/lov_obd.c @@ -454,9 +454,15 @@ lov_add_obd(struct obd_device *obd, struct obd_uuid *uuidp, int index, int gen) RETURN(-ENOMEM); } - memset(tgt, 0, bufsize); if (lov->tgts) { + int i; memcpy(tgt, lov->tgts, lov->bufsize); + LASSERT(index == lov->desc.ld_tgt_count); + for (i = 0; i < index; i++) { + INIT_LIST_HEAD(&tgt[i].qos_bavail_list); + list_splice(&lov->tgts[i].qos_bavail_list, + &tgt[i].qos_bavail_list); + } OBD_FREE(lov->tgts, lov->bufsize); } @@ -476,6 +482,8 @@ lov_add_obd(struct obd_device *obd, struct obd_uuid *uuidp, int index, int gen) tgt->uuid = *uuidp; /* XXX - add a sanity check on the generation number. */ tgt->ltd_gen = gen; + tgt->index = index; + INIT_LIST_HEAD(&tgt->qos_bavail_list); old_count = lov->desc.ld_tgt_count; if (index >= lov->desc.ld_tgt_count) @@ -600,7 +608,8 @@ static int lov_setup(struct obd_device *obd, obd_count len, void *buf) struct lustre_cfg *lcfg = buf; struct lov_desc *desc; struct lov_obd *lov = &obd->u.lov; - int count; + struct lov_tgt_desc *tgts; + int count, i; ENTRY; if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) { @@ -665,12 +674,16 @@ static int lov_setup(struct obd_device *obd, obd_count len, void *buf) CERROR("Out of memory\n"); RETURN(-EINVAL); } - memset(lov->tgts, 0, lov->bufsize); + for (i = 0, tgts = lov->tgts; i < max(count, 1); i++, tgts++) { + tgts->index = i; + INIT_LIST_HEAD(&tgts->qos_bavail_list); + } desc->ld_active_tgt_count = 0; lov->desc = *desc; sema_init(&lov->lov_lock, 1); atomic_set(&lov->refcount, 0); + INIT_LIST_HEAD(&lov->qos_bavail_list); lprocfs_init_vars(lov, &lvars); lprocfs_obd_setup(obd, lvars.obd_vars); @@ -882,7 +895,9 @@ static int lov_create(struct obd_export *exp, struct obdo *src_oa, { struct lov_obd *lov; struct lov_request_set *set = NULL; - struct list_head *pos; + struct obd_statfs osfs; + unsigned long maxage; + struct lov_request *req; int rc = 0; ENTRY; @@ -900,6 +915,9 @@ static int lov_create(struct obd_export *exp, struct obdo *src_oa, if (!lov->desc.ld_active_tgt_count) RETURN(-EIO); + maxage = jiffies - lov->desc.ld_qos_maxage * HZ; + obd_statfs(exp->exp_obd, &osfs, maxage); + /* Recreate a specific object id at the given OST index */ if ((src_oa->o_valid & OBD_MD_FLFLAGS) && (src_oa->o_flags & OBD_FL_RECREATE_OBJS)) { @@ -911,10 +929,7 @@ static int lov_create(struct obd_export *exp, struct obdo *src_oa, if (rc) RETURN(rc); - list_for_each (pos, &set->set_list) { - struct lov_request *req = - list_entry(pos, struct lov_request, rq_link); - + list_for_each_entry(req, &set->set_list, rq_link) { /* XXX: LOV STACKING: use real "obj_mdp" sub-data */ rc = obd_create(lov->tgts[req->rq_idx].ltd_exp, req->rq_oa, &req->rq_md, oti); @@ -1907,6 +1922,7 @@ static int lov_statfs(struct obd_device *obd, struct obd_statfs *osfs, rc = err; continue; } + qos_update(lov, i, &lov_sfs); if (!set) { memcpy(osfs, &lov_sfs, sizeof(lov_sfs)); diff --git a/lustre/lov/lov_qos.c b/lustre/lov/lov_qos.c index a3b9b42..bebbced 100644 --- a/lustre/lov/lov_qos.c +++ b/lustre/lov/lov_qos.c @@ -84,23 +84,21 @@ int qos_remedy_create(struct lov_request_set *set, struct lov_request *req) int stripe, i, rc = -EIO; ENTRY; - ost_idx = (req->rq_idx + lsm->lsm_stripe_count) % ost_count; + ost_idx = (req->rq_idx + 1) % ost_count; for (i = 0; i < ost_count; i++, ost_idx = (ost_idx + 1) % ost_count) { if (lov->tgts[ost_idx].active == 0) { CDEBUG(D_HA, "lov idx %d inactive\n", ost_idx); continue; } /* check if objects has been created on this ost */ - for (stripe = 0; stripe < lsm->lsm_stripe_count; stripe++) { - if (stripe == req->rq_stripe) - continue; + for (stripe = req->rq_stripe; stripe >= 0; stripe--) { if (ost_idx == lsm->lsm_oinfo[stripe].loi_ost_idx) break; } - if (stripe >= lsm->lsm_stripe_count) { + if (stripe < 0) { req->rq_idx = ost_idx; - rc = obd_create(lov->tgts[ost_idx].ltd_exp, req->rq_oa, + rc = obd_create(lov->tgts[ost_idx].ltd_exp, req->rq_oa, &req->rq_md, set->set_oti); if (!rc) break; @@ -111,73 +109,344 @@ int qos_remedy_create(struct lov_request_set *set, struct lov_request *req) #define LOV_CREATE_RESEED_MULT 4 #define LOV_CREATE_RESEED_MIN 1000 -/* FIXME use real qos data to prepare the lov create request */ -int qos_prep_create(struct lov_obd *lov, struct lov_request_set *set, int newea) +/* alloc objects on osts with round-robin algorithm */ +static int alloc_rr(struct lov_obd *lov, int *idx_arr, int *stripe_cnt) { - static int ost_start_idx, ost_start_count; + static int ost_start_count, ost_start_idx; unsigned ost_idx, ost_count = lov->desc.ld_tgt_count; unsigned ost_active_count = lov->desc.ld_active_tgt_count; - struct lov_stripe_md *lsm = set->set_md; - struct obdo *src_oa = set->set_oa; - int i, rc = 0; + int i, *idx_pos = idx_arr; ENTRY; - - LASSERT(src_oa->o_valid & OBD_MD_FLID); - - lsm->lsm_object_id = src_oa->o_id; - if (!lsm->lsm_stripe_size) - lsm->lsm_stripe_size = lov->desc.ld_default_stripe_size; - if (!lsm->lsm_pattern) { - lsm->lsm_pattern = lov->desc.ld_pattern ? - lov->desc.ld_pattern : LOV_PATTERN_RAID0; + + if (--ost_start_count <= 0) { + ost_start_idx = ll_insecure_random_int(); + ost_start_count = + (LOV_CREATE_RESEED_MIN / max(ost_active_count, 1U) + + LOV_CREATE_RESEED_MULT) * max(ost_active_count, 1U); + } else if (*stripe_cnt >= lov->desc.ld_active_tgt_count) { + /* If we allocate from all of the stripes, make the + * next file start on the next OST. */ + ++ost_start_idx; } + ost_idx = ost_start_idx % ost_count; - if (newea || lsm->lsm_oinfo[0].loi_ost_idx >= ost_count) { - if (--ost_start_count <= 0) { - ost_start_idx = ll_rand(); - ost_start_count = - (LOV_CREATE_RESEED_MIN / max(ost_active_count, 1U) + - LOV_CREATE_RESEED_MULT) * max(ost_active_count, 1U); - } else if (lsm->lsm_stripe_count >= ost_active_count) { - /* If we allocate from all of the stripes, make the - * next file start on the next OST. */ - ++ost_start_idx; + for (i = 0; i < ost_count; i++, ost_idx = (ost_idx + 1) % ost_count) { + ++ost_start_idx; + + if (lov->tgts[ost_idx].active == 0) { + CDEBUG(D_HA, "lov idx %d inactive\n", ost_idx); + continue; } - ost_idx = ost_start_idx % ost_count; - } else { - ost_idx = lsm->lsm_oinfo[0].loi_ost_idx; + + *idx_pos = ost_idx; + idx_pos++; + /* got enough ost */ + if (idx_pos - idx_arr == *stripe_cnt) + RETURN(0); } + *stripe_cnt = idx_pos - idx_arr; + RETURN(0); +} - CDEBUG(D_INODE, "allocating %d subobjs for objid "LPX64" at idx %d\n", - lsm->lsm_stripe_count, lsm->lsm_object_id, ost_idx); +/* alloc objects on osts with specific stripe offset */ +static int alloc_specific(struct lov_obd *lov, struct lov_stripe_md *lsm, + int *idx_arr) +{ + unsigned ost_idx, ost_count = lov->desc.ld_tgt_count; + int i, *idx_pos = idx_arr; + ENTRY; + ost_idx = lsm->lsm_oinfo[0].loi_ost_idx; for (i = 0; i < ost_count; i++, ost_idx = (ost_idx + 1) % ost_count) { - struct lov_request *req; - - ++ost_start_idx; if (lov->tgts[ost_idx].active == 0) { CDEBUG(D_HA, "lov idx %d inactive\n", ost_idx); continue; } + *idx_pos = ost_idx; + idx_pos++; + /* got enough ost */ + if (idx_pos - idx_arr == lsm->lsm_stripe_count) + RETURN(0); + } + /* If we were passed specific striping params, then a failure to + * meet those requirements is an error, since we can't reallocate + * that memory (it might be part of a larger array or something). + * + * We can only get here if lsm_stripe_count was originally > 1. + */ + CERROR("can't lstripe objid "LPX64": have %u want %u\n", + lsm->lsm_object_id, idx_pos - idx_arr, lsm->lsm_stripe_count); + RETURN(-EFBIG); +} + +/* free space OST must have to be used for object allocation. */ +#define QOS_MIN (lov->desc.ld_qos_threshold << 20) + +#define TGT_BAVAIL(tgt) (tgt->ltd_exp->exp_obd->obd_osfs.os_bavail * \ + tgt->ltd_exp->exp_obd->obd_osfs.os_bsize) +#define TGT_FFREE(tgt) (tgt->ltd_exp->exp_obd->obd_osfs.os_ffree) + +/* alloc objects on osts with free space weighted algorithm */ +static int alloc_qos(struct obd_export *exp, int *idx_arr, int *stripe_cnt) +{ + struct lov_obd *lov = &exp->exp_obd->u.lov; + unsigned ost_count = lov->desc.ld_tgt_count; + __u64 cur_bavail, rand, *availspace, total_bavail = 0; + int *indexes, nfound, good_osts, i, warn = 0, rc = 0; + struct lov_tgt_desc *tgt; + int shift, require_stripes = *stripe_cnt; + static time_t last_warn = 0; + time_t now = cfs_time_current_sec(); + ENTRY; + + availspace = NULL; + indexes = NULL; + OBD_ALLOC(availspace, sizeof(__u64) * ost_count); + OBD_ALLOC(indexes, sizeof(int) * require_stripes); + if (!availspace || !indexes) + GOTO(out_free, rc = -EAGAIN); + + mutex_down(&lov->lov_lock); + + /* if free space is below some threshold, just go + * to do round-robin allocation */ + total_bavail = (exp->exp_obd->obd_osfs.os_bavail * \ + exp->exp_obd->obd_osfs.os_bsize); + if (ost_count < 2 || total_bavail <= QOS_MIN) { + mutex_up(&lov->lov_lock); + GOTO(out_free, rc = -EAGAIN); + } + + /* if each ost has almost same free space, go to + * do rr allocation for better creation performance */ + if (!list_empty(&lov->qos_bavail_list)) { + __u64 max, min, val; + tgt = list_entry(lov->qos_bavail_list.next, + struct lov_tgt_desc, qos_bavail_list); + max = TGT_BAVAIL(tgt); + tgt = list_entry(lov->qos_bavail_list.prev, + struct lov_tgt_desc, qos_bavail_list); + min = TGT_BAVAIL(tgt); + + val = (max >= min) ? (max - min) : (min - max); + min = (min * 13) >> 8; /* less than 5% of gap */ + + if (val < min) { + mutex_up(&lov->lov_lock); + GOTO(out_free, rc = -EAGAIN); + } + } else { + mutex_up(&lov->lov_lock); + GOTO(out_free, rc = -EAGAIN); + } + + total_bavail = 0; + good_osts = 0; + /* warn zero available space/inode every 30 min */ + if (cfs_time_sub(now, last_warn) > 60 * 30) + warn = 1; + list_for_each_entry(tgt, &lov->qos_bavail_list, qos_bavail_list) { + if (!tgt->active) + continue; + if (!TGT_BAVAIL(tgt)) { + if (warn) { + CWARN("avail space on %s is zero\n", + tgt->uuid.uuid); + last_warn = now; + } + continue; + } + if (!TGT_FFREE(tgt)) { + if (warn) { + CWARN("free inode on %s is zero\n", + tgt->uuid.uuid); + last_warn = now; + } + continue; + } + if ((TGT_BAVAIL(tgt) <= QOS_MIN) && (good_osts >= *stripe_cnt)) + break; + availspace[good_osts] = TGT_BAVAIL(tgt); + indexes[good_osts] = tgt->index; + total_bavail += availspace[good_osts]; + good_osts++; + } + + mutex_up(&lov->lov_lock); + + if (!total_bavail) + GOTO(out_free, rc = -ENOSPC); + + /* if we don't have enough good OSTs, we reduce the stripe count. */ + if (good_osts < *stripe_cnt) + *stripe_cnt = good_osts; + + if (!*stripe_cnt) + GOTO(out_free, rc = -EAGAIN); + + nfound = shift = 0; + while ((total_bavail >> shift) > 0) + shift++; + shift++; + /* search enough OSTs with free space weighted random allocation */ + while (nfound < *stripe_cnt) { + cur_bavail = 0; + + get_random_bytes(&rand, sizeof(rand)); + if (shift < 64) + rand &= ((1 << shift) - 1); + while (rand > total_bavail) + rand -= total_bavail; + + for (i = 0; i < good_osts; i++) { + cur_bavail += availspace[i]; + if (cur_bavail >= rand) { + total_bavail -= availspace[i]; + availspace[i] = 0; + idx_arr[nfound] = indexes[i]; + nfound++; + break; + } + } + /* should never satisfy below condition */ + if (cur_bavail == 0) + break; + } + LASSERT(nfound == *stripe_cnt); + +out_free: + if (availspace) + OBD_FREE(availspace, sizeof(__u64) * ost_count); + if (indexes) + OBD_FREE(indexes, sizeof(int) * require_stripes); + if (rc != -EAGAIN) + RETURN(rc); + + rc = alloc_rr(lov, idx_arr, stripe_cnt); + RETURN(rc); +} + +/* return new alloced stripe count in success */ +static int alloc_idx_array(struct obd_export *exp, struct lov_stripe_md *lsm, + int newea, int **idx_arr, int *arr_cnt) +{ + struct lov_obd *lov = &exp->exp_obd->u.lov; + int stripe_cnt = lsm->lsm_stripe_count; + int i, rc = 0; + int *tmp_arr = NULL; + ENTRY; + + *arr_cnt = stripe_cnt; + OBD_ALLOC(tmp_arr, *arr_cnt * sizeof(int)); + if (tmp_arr == NULL) + RETURN(-ENOMEM); + for (i = 0; i < *arr_cnt; i++) + tmp_arr[i] = -1; + + if (newea || + lsm->lsm_oinfo[0].loi_ost_idx >= lov->desc.ld_tgt_count) + rc = alloc_qos(exp, tmp_arr, &stripe_cnt); + else + rc = alloc_specific(lov, lsm, tmp_arr); + + if (rc) + GOTO(out_arr, rc); + + *idx_arr = tmp_arr; + RETURN(stripe_cnt); +out_arr: + OBD_FREE(tmp_arr, *arr_cnt * sizeof(int)); + *arr_cnt = 0; + RETURN(rc); +} + +static void free_idx_array(int *idx_arr, int arr_cnt) +{ + if (arr_cnt) + OBD_FREE(idx_arr, arr_cnt * sizeof(int)); +} + +int qos_prep_create(struct obd_export *exp, struct lov_request_set *set) +{ + struct lov_obd *lov = &exp->exp_obd->u.lov; + struct lov_stripe_md *lsm; + struct obdo *src_oa = set->set_oa; + struct obd_trans_info *oti = set->set_oti; + int i, stripes, rc = 0, newea = 0; + int *idx_arr, idx_cnt = 0; + ENTRY; + + LASSERT(src_oa->o_valid & OBD_MD_FLID); + + if (set->set_md == NULL) { + int stripe_cnt = lov_get_stripecnt(lov, 0); + + /* If the MDS file was truncated up to some size, stripe over + * enough OSTs to allow the file to be created at that size. */ + if (src_oa->o_valid & OBD_MD_FLSIZE) { + struct lov_tgt_desc *tgt; + stripes = 1; + + mutex_down(&lov->lov_lock); + list_for_each_entry(tgt, &lov->qos_bavail_list, + qos_bavail_list) { + if (!tgt->active) + continue; + if (TGT_BAVAIL(tgt) * stripes > src_oa->o_size) + break; + stripes++; + } + mutex_up(&lov->lov_lock); + + if (stripes < stripe_cnt) + stripes = stripe_cnt; + } else { + stripes = stripe_cnt; + } + rc = lov_alloc_memmd(&set->set_md, stripes, + lov->desc.ld_pattern ? + lov->desc.ld_pattern : LOV_PATTERN_RAID0, + LOV_MAGIC); + if (rc < 0) + GOTO(out_err, rc); + rc = 0; + newea = 1; + } + lsm = set->set_md; + + lsm->lsm_object_id = src_oa->o_id; + if (!lsm->lsm_stripe_size) + lsm->lsm_stripe_size = lov->desc.ld_default_stripe_size; + if (!lsm->lsm_pattern) { + LASSERT(lov->desc.ld_pattern); + lsm->lsm_pattern = lov->desc.ld_pattern; + } + + stripes = alloc_idx_array(exp, lsm, newea, &idx_arr, &idx_cnt); + LASSERT(stripes <= lsm->lsm_stripe_count); + if (stripes <= 0) + GOTO(out_err, rc = stripes ? stripes : -EIO); + + for (i = 0; i < stripes; i++) { + struct lov_request *req; + int ost_idx = idx_arr[i]; + LASSERT(ost_idx >= 0); + OBD_ALLOC(req, sizeof(*req)); if (req == NULL) - GOTO(out, rc = -ENOMEM); + GOTO(out_err, rc = -ENOMEM); + lov_set_add_req(req, set); req->rq_buflen = sizeof(*req->rq_md); OBD_ALLOC(req->rq_md, req->rq_buflen); - if (req->rq_md == NULL) { - OBD_FREE_PTR(req); - GOTO(out, rc = -ENOMEM); - } - + if (req->rq_md == NULL) + GOTO(out_err, rc = -ENOMEM); + req->rq_oa = obdo_alloc(); - if (req->rq_oa == NULL) { - OBD_FREE_PTR(req->rq_md); - OBD_FREE_PTR(req); - GOTO(out, rc = -ENOMEM); - } - + if (req->rq_oa == NULL) + GOTO(out_err, rc = -ENOMEM); + req->rq_idx = ost_idx; req->rq_stripe = i; /* create data objects with "parent" OA */ @@ -188,41 +457,74 @@ int qos_prep_create(struct lov_obd *lov, struct lov_request_set *set, int newea) * stripe which holds the existing file size. */ if (src_oa->o_valid & OBD_MD_FLSIZE) { - if (lov_stripe_offset(lsm, src_oa->o_size, i, - &req->rq_oa->o_size) < 0 && - req->rq_oa->o_size) - req->rq_oa->o_size--; + req->rq_oa->o_size = + lov_size_to_stripe(lsm, src_oa->o_size, i); CDEBUG(D_INODE, "stripe %d has size "LPU64"/"LPU64"\n", i, req->rq_oa->o_size, src_oa->o_size); } - lov_set_add_req(req, set); - - /* If we have allocated enough objects, we are OK */ - if (set->set_count == lsm->lsm_stripe_count) - GOTO(out, rc = 0); } + LASSERT(set->set_count == stripes); - if (set->set_count == 0) - GOTO(out, rc = -EIO); - - /* If we were passed specific striping params, then a failure to - * meet those requirements is an error, since we can't reallocate - * that memory (it might be part of a larger array or something). - * - * We can only get here if lsm_stripe_count was originally > 1. - */ - if (!newea) { - CERROR("can't lstripe objid "LPX64": have %u want %u, rc %d\n", - lsm->lsm_object_id, set->set_count, - lsm->lsm_stripe_count, rc); - rc = rc ? rc : -EFBIG; - } else { + if (stripes < lsm->lsm_stripe_count) qos_shrink_lsm(set); - rc = 0; + + if (oti && (src_oa->o_valid & OBD_MD_FLCOOKIE)) { + oti_alloc_cookies(oti, set->set_count); + if (!oti->oti_logcookies) + GOTO(out_err, rc = -ENOMEM); + set->set_cookies = oti->oti_logcookies; } -out: +out_err: + if (newea && rc) + obd_free_memmd(exp, &set->set_md); + free_idx_array(idx_arr, idx_cnt); + EXIT; + return rc; +} - RETURN(rc); +/* An caveat here is don't use list_move() on same list */ +#define list_adjust(tgt, lov, list_name, value) \ +{ \ + struct list_head *element; \ + struct lov_tgt_desc *tmp; \ + if (list_empty(&(tgt)->list_name)) \ + list_add(&(tgt)->list_name, &(lov)->list_name); \ + element = (tgt)->list_name.next; \ + while((element != &(lov)->list_name) && \ + (tmp = list_entry(element, struct lov_tgt_desc, list_name)) && \ + (value(tgt) < value(tmp))) \ + element = element->next; \ + if (element != (tgt)->list_name.next) { \ + list_del_init(&(tgt)->list_name); \ + list_add(&(tgt)->list_name, element->prev); \ + } \ + element = (tgt)->list_name.prev; \ + while ((element != &(lov)->list_name) && \ + (tmp = list_entry(element, struct lov_tgt_desc, list_name)) && \ + (value(tgt) > value(tmp))) \ + element = element->prev; \ + if (element != (tgt)->list_name.prev) { \ + list_del_init(&(tgt)->list_name); \ + list_add_tail(&(tgt)->list_name, element->prev); \ + } \ } + +void qos_update(struct lov_obd *lov, int idx, struct obd_statfs *osfs) +{ + struct lov_tgt_desc *tgt = &lov->tgts[idx]; + __u64 bavail; + ENTRY; + + bavail = osfs->os_bavail * osfs->os_bsize; + if (!bavail) + CWARN("ost %d has zero avail space!\n", idx); + + CDEBUG(D_OTHER, "QOS: bfree now "LPU64"\n", bavail); + + mutex_down(&lov->lov_lock); + list_adjust(tgt, lov, qos_bavail_list, TGT_BAVAIL); + mutex_up(&lov->lov_lock); +} + diff --git a/lustre/lov/lov_request.c b/lustre/lov/lov_request.c index 4403eda..291bad5 100644 --- a/lustre/lov/lov_request.c +++ b/lustre/lov/lov_request.c @@ -591,10 +591,8 @@ int lov_fini_create_set(struct lov_request_set *set,struct lov_stripe_md **lsmp) if (set == NULL) RETURN(0); LASSERT(set->set_exp); - if (set->set_completes) { + if (set->set_completes) rc = create_done(set->set_exp, set, lsmp); - /* FIXME update qos data here */ - } if (atomic_dec_and_test(&set->set_refcount)) lov_finish_set(set); @@ -649,9 +647,8 @@ int lov_prep_create_set(struct obd_export *exp, struct lov_stripe_md **lsmp, struct obdo *src_oa, struct obd_trans_info *oti, struct lov_request_set **reqset) { - struct lov_obd *lov = &exp->exp_obd->u.lov; struct lov_request_set *set; - int rc = 0, newea = 0; + int rc = 0; ENTRY; OBD_ALLOC(set, sizeof(*set)); @@ -664,51 +661,11 @@ int lov_prep_create_set(struct obd_export *exp, struct lov_stripe_md **lsmp, set->set_oa = src_oa; set->set_oti = oti; - if (set->set_md == NULL) { - int stripes, stripe_cnt; - stripe_cnt = lov_get_stripecnt(lov, 0); - - /* If the MDS file was truncated up to some size, stripe over - * enough OSTs to allow the file to be created at that size. */ - if (src_oa->o_valid & OBD_MD_FLSIZE) { - stripes=((src_oa->o_size+LUSTRE_STRIPE_MAXBYTES)>>12)-1; - do_div(stripes, (__u32)(LUSTRE_STRIPE_MAXBYTES >> 12)); - - if (stripes > lov->desc.ld_active_tgt_count) - GOTO(out_set, rc = -EFBIG); - if (stripes < stripe_cnt) - stripes = stripe_cnt; - } else { - stripes = stripe_cnt; - } - - rc = lov_alloc_memmd(&set->set_md, stripes, - lov->desc.ld_pattern ? - lov->desc.ld_pattern : LOV_PATTERN_RAID0, - LOV_MAGIC); - if (rc < 0) - goto out_set; - newea = 1; - } - - rc = qos_prep_create(lov, set, newea); + rc = qos_prep_create(exp, set); if (rc) - goto out_lsm; - - if (oti && (src_oa->o_valid & OBD_MD_FLCOOKIE)) { - oti_alloc_cookies(oti, set->set_count); - if (!oti->oti_logcookies) - goto out_lsm; - set->set_cookies = oti->oti_logcookies; - } - *reqset = set; - RETURN(rc); - -out_lsm: - if (*lsmp == NULL) - obd_free_memmd(exp, &set->set_md); -out_set: - lov_fini_create_set(set, lsmp); + lov_fini_create_set(set, lsmp); + else + *reqset = set; RETURN(rc); } diff --git a/lustre/lov/lproc_lov.c b/lustre/lov/lproc_lov.c index 5fc85fe..5ae9f62 100644 --- a/lustre/lov/lproc_lov.c +++ b/lustre/lov/lproc_lov.c @@ -118,6 +118,68 @@ static int lov_rd_desc_uuid(char *page, char **start, off_t off, int count, return snprintf(page, count, "%s\n", lov->desc.ld_uuid.uuid); } +static int lov_rd_qos_threshold(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device *dev = (struct obd_device*) data; + struct lov_obd *lov; + + LASSERT(dev != NULL); + lov = &dev->u.lov; + *eof = 1; + return snprintf(page, count, "%u MB\n", lov->desc.ld_qos_threshold); +} + +static int lov_wr_qos_threshold(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *dev = (struct obd_device *)data; + struct lov_obd *lov; + int val, rc; + LASSERT(dev != NULL); + + lov = &dev->u.lov; + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + if (val <= 0) + return -EINVAL; + lov->desc.ld_qos_threshold = val; + return count; +} + +static int lov_rd_qos_maxage(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device *dev = (struct obd_device*) data; + struct lov_obd *lov; + + LASSERT(dev != NULL); + lov = &dev->u.lov; + *eof = 1; + return snprintf(page, count, "%u Sec\n", lov->desc.ld_qos_maxage); +} + +static int lov_wr_qos_maxage(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *dev = (struct obd_device *)data; + struct lov_obd *lov; + int val, rc; + LASSERT(dev != NULL); + + lov = &dev->u.lov; + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + if (val <= 0) + return -EINVAL; + lov->desc.ld_qos_maxage = val; + return count; +} + static void *lov_tgt_seq_start(struct seq_file *p, loff_t *pos) { struct obd_device *dev = p->private; @@ -188,6 +250,8 @@ struct lprocfs_vars lprocfs_obd_vars[] = { { "kbytesfree", lprocfs_rd_kbytesfree, 0, 0 }, { "kbytesavail", lprocfs_rd_kbytesavail, 0, 0 }, { "desc_uuid", lov_rd_desc_uuid, 0, 0 }, + { "qos_threshold",lov_rd_qos_threshold, lov_wr_qos_threshold, 0 }, + { "qos_maxage", lov_rd_qos_maxage, lov_wr_qos_maxage, 0 }, { 0 } }; diff --git a/lustre/ptlrpc/pack_generic.c b/lustre/ptlrpc/pack_generic.c index 774021b..820fb0b 100644 --- a/lustre/ptlrpc/pack_generic.c +++ b/lustre/ptlrpc/pack_generic.c @@ -765,6 +765,8 @@ void lustre_swab_lov_desc (struct lov_desc *ld) __swab64s (&ld->ld_default_stripe_size); __swab64s (&ld->ld_default_stripe_offset); __swab32s (&ld->ld_pattern); + __swab32s (&ld->ld_qos_threshold); + __swab32s (&ld->ld_qos_maxage); /* uuid endian insensitive */ } @@ -935,8 +937,8 @@ void lustre_swab_qdata(struct qunit_data *d) void lustre_assert_wire_constants(void) { /* Wire protocol assertions generated by 'wirecheck' - * running on Linux schatzie.adilger.int 2.6.12-1.1381_FC3 #1 Fri Oct 21 03:46:55 EDT 2005 i6 - * with gcc version 3.3.4 20040817 (Red Hat Linux 3.3.4-2) */ + * running on Linux vmbuild 2.6.9-build #8 Sun Dec 18 16:30:50 MST 2005 i686 i686 i386 GNU/Li + * with gcc version 3.4.2 20041017 (Red Hat 3.4.2-6.fc3) */ /* Constants... */ @@ -2069,26 +2071,22 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct lov_desc, ld_default_stripe_offset)); LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset) == 8, " found %lld\n", (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset)); - LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_offset) == 24, " found %lld\n", - (long long)(int)offsetof(struct lov_desc, ld_default_stripe_offset)); - LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset) == 8, " found %lld\n", - (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset)); - LASSERTF((int)offsetof(struct lov_desc, ld_padding_1) == 32, " found %lld\n", + LASSERTF((int)offsetof(struct lov_desc, ld_qos_threshold) == 32, " found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_qos_threshold)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_qos_threshold) == 4, " found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_qos_threshold)); + LASSERTF((int)offsetof(struct lov_desc, ld_qos_maxage) == 36, " found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_qos_maxage)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_qos_maxage) == 4, " found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_qos_maxage)); + LASSERTF((int)offsetof(struct lov_desc, ld_padding_1) == 40, " found %lld\n", (long long)(int)offsetof(struct lov_desc, ld_padding_1)); LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_1) == 4, " found %lld\n", (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_1)); - LASSERTF((int)offsetof(struct lov_desc, ld_padding_2) == 36, " found %lld\n", + LASSERTF((int)offsetof(struct lov_desc, ld_padding_2) == 44, " found %lld\n", (long long)(int)offsetof(struct lov_desc, ld_padding_2)); LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_2) == 4, " found %lld\n", (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_2)); - LASSERTF((int)offsetof(struct lov_desc, ld_padding_3) == 40, " found %lld\n", - (long long)(int)offsetof(struct lov_desc, ld_padding_3)); - LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_3) == 4, " found %lld\n", - (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_3)); - LASSERTF((int)offsetof(struct lov_desc, ld_padding_4) == 44, " found %lld\n", - (long long)(int)offsetof(struct lov_desc, ld_padding_4)); - LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_4) == 4, " found %lld\n", - (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_4)); LASSERTF((int)offsetof(struct lov_desc, ld_uuid) == 48, " found %lld\n", (long long)(int)offsetof(struct lov_desc, ld_uuid)); LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_uuid) == 40, " found %lld\n", diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 9f9c63a..a32f28e 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -11,7 +11,7 @@ ONLY=${ONLY:-"$*"} ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"42a 42b 42c 42d 45 68"} # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! -[ "$SLOW" = "no" ] && EXCEPT="$EXCEPT 24o 27m 51b 51c 63 64b 71 101" +[ "$SLOW" = "no" ] && EXCEPT="$EXCEPT 24o 27m 51b 51c 63 64b 71 77 101" case `uname -r` in 2.4*) FSTYPE=${FSTYPE:-ext3}; ALWAYS_EXCEPT="$ALWAYS_EXCEPT 76" ;; @@ -2707,6 +2707,11 @@ test_76() { # bug 1443 } run_test 76 "destroy duplicate inodes in client inode cache" +test_77() { + sh qos.sh +} +run_test 77 "qos test ============================================" + # on the LLNL clusters, runas will still pick up root's $TMP settings, # which will not be writable for the runas user, and then you get a CVS # error message with a corrupt path string (CVS bug) and panic. diff --git a/lustre/utils/lustre_cfg.c b/lustre/utils/lustre_cfg.c index 3c5544e..cb717b5 100644 --- a/lustre/utils/lustre_cfg.c +++ b/lustre/utils/lustre_cfg.c @@ -398,6 +398,8 @@ int jt_lcfg_lov_setup(int argc, char **argv) jt_cmdname(argv[0]), argv[5]); return CMD_HELP; } + desc.ld_qos_threshold = QOS_DEFAULT_THRESHOLD; + desc.ld_qos_maxage = QOS_DEFAULT_MAXAGE; if (argc == 7) { desc.ld_tgt_count = strtoul(argv[6], &end, 0); diff --git a/lustre/utils/wirecheck.c b/lustre/utils/wirecheck.c index 2b56b83..bb189c5 100644 --- a/lustre/utils/wirecheck.c +++ b/lustre/utils/wirecheck.c @@ -494,10 +494,10 @@ check_lov_desc(void) CHECK_MEMBER(lov_desc, ld_pattern); CHECK_MEMBER(lov_desc, ld_default_stripe_size); CHECK_MEMBER(lov_desc, ld_default_stripe_offset); + CHECK_MEMBER(lov_desc, ld_qos_threshold); + CHECK_MEMBER(lov_desc, ld_qos_maxage); CHECK_MEMBER(lov_desc, ld_padding_1); CHECK_MEMBER(lov_desc, ld_padding_2); - CHECK_MEMBER(lov_desc, ld_padding_3); - CHECK_MEMBER(lov_desc, ld_padding_4); CHECK_MEMBER(lov_desc, ld_uuid); } diff --git a/lustre/utils/wiretest.c b/lustre/utils/wiretest.c index 5845795..5149d7f 100644 --- a/lustre/utils/wiretest.c +++ b/lustre/utils/wiretest.c @@ -26,8 +26,8 @@ int main() void lustre_assert_wire_constants(void) { /* Wire protocol assertions generated by 'wirecheck' - * running on Linux schatzie.adilger.int 2.6.12-1.1381_FC3 #1 Fri Oct 21 03:46:55 EDT 2005 i6 - * with gcc version 3.3.4 20040817 (Red Hat Linux 3.3.4-2) */ + * running on Linux vmbuild 2.6.9-build #8 Sun Dec 18 16:30:50 MST 2005 i686 i686 i386 GNU/Li + * with gcc version 3.4.2 20041017 (Red Hat 3.4.2-6.fc3) */ /* Constants... */ @@ -1160,26 +1160,22 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct lov_desc, ld_default_stripe_offset)); LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset) == 8, " found %lld\n", (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset)); - LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_offset) == 24, " found %lld\n", - (long long)(int)offsetof(struct lov_desc, ld_default_stripe_offset)); - LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset) == 8, " found %lld\n", - (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset)); - LASSERTF((int)offsetof(struct lov_desc, ld_padding_1) == 32, " found %lld\n", + LASSERTF((int)offsetof(struct lov_desc, ld_qos_threshold) == 32, " found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_qos_threshold)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_qos_threshold) == 4, " found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_qos_threshold)); + LASSERTF((int)offsetof(struct lov_desc, ld_qos_maxage) == 36, " found %lld\n", + (long long)(int)offsetof(struct lov_desc, ld_qos_maxage)); + LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_qos_maxage) == 4, " found %lld\n", + (long long)(int)sizeof(((struct lov_desc *)0)->ld_qos_maxage)); + LASSERTF((int)offsetof(struct lov_desc, ld_padding_1) == 40, " found %lld\n", (long long)(int)offsetof(struct lov_desc, ld_padding_1)); LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_1) == 4, " found %lld\n", (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_1)); - LASSERTF((int)offsetof(struct lov_desc, ld_padding_2) == 36, " found %lld\n", + LASSERTF((int)offsetof(struct lov_desc, ld_padding_2) == 44, " found %lld\n", (long long)(int)offsetof(struct lov_desc, ld_padding_2)); LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_2) == 4, " found %lld\n", (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_2)); - LASSERTF((int)offsetof(struct lov_desc, ld_padding_3) == 40, " found %lld\n", - (long long)(int)offsetof(struct lov_desc, ld_padding_3)); - LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_3) == 4, " found %lld\n", - (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_3)); - LASSERTF((int)offsetof(struct lov_desc, ld_padding_4) == 44, " found %lld\n", - (long long)(int)offsetof(struct lov_desc, ld_padding_4)); - LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_4) == 4, " found %lld\n", - (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_4)); LASSERTF((int)offsetof(struct lov_desc, ld_uuid) == 48, " found %lld\n", (long long)(int)offsetof(struct lov_desc, ld_uuid)); LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_uuid) == 40, " found %lld\n", -- 1.8.3.1