From d574c18bb71327fe57ea1dec1e892d3cd83a438a Mon Sep 17 00:00:00 2001 From: nathan Date: Tue, 9 May 2006 18:17:02 +0000 Subject: [PATCH] Branch b1_5 b=9862 r=adilger(pending) Optimized stripe assignment, based on: 1. space remaining on each OST ("QOS" from b1_5) 2. OSS optimization, where we try to improve network usage by distributing stripes between OSS's (nodes) rather than just OSTs. (For rr and qos assignments). --- lustre/include/lustre/lustre_idl.h | 5 +- lustre/include/obd.h | 67 +++-- lustre/llite/dir.c | 2 +- lustre/llite/llite_lib.c | 2 +- lustre/lov/lov_internal.h | 4 + lustre/lov/lov_log.c | 8 +- lustre/lov/lov_obd.c | 196 ++++++------- lustre/lov/lov_qos.c | 575 +++++++++++++++++++++++++++---------- lustre/lov/lov_request.c | 28 +- lustre/lov/lproc_lov.c | 25 +- lustre/ptlrpc/pack_generic.c | 5 - lustre/quota/quota_check.c | 4 +- lustre/quota/quota_ctl.c | 4 +- lustre/utils/lustre_cfg.c | 1 - lustre/utils/wirecheck.c | 1 - lustre/utils/wiretest.c | 4 - 16 files changed, 625 insertions(+), 306 deletions(-) diff --git a/lustre/include/lustre/lustre_idl.h b/lustre/include/lustre/lustre_idl.h index f99f3ff..80d0ef1 100644 --- a/lustre/include/lustre/lustre_idl.h +++ b/lustre/include/lustre/lustre_idl.h @@ -838,14 +838,15 @@ extern void lustre_swab_mds_rec_rename (struct mds_rec_rename *rn); #define LOV_DESC_MAGIC 0xB0CCDE5C +/* LOV settings descriptor (should only contain static info) */ struct lov_desc { __u32 ld_tgt_count; /* how many OBD's */ __u32 ld_active_tgt_count; /* how many active */ __u32 ld_default_stripe_count; /* how many objects are used */ - __u32 ld_pattern; /* PATTERN_RAID0, PATTERN_RAID1 */ + __u32 ld_pattern; /* default PATTERN_RAID0 */ __u64 ld_default_stripe_size; /* in bytes */ __u64 ld_default_stripe_offset; /* in bytes */ - __u32 ld_qos_threshold; /* in MB */ + __u32 ld_padding_0; /* unused */ __u32 ld_qos_maxage; /* in second */ __u32 ld_padding_1; /* also fix lustre_swab_lov_desc */ __u32 ld_padding_2; /* also fix 
lustre_swab_lov_desc */ diff --git a/lustre/include/obd.h b/lustre/include/obd.h index 751542c..16ea176 100644 --- a/lustre/include/obd.h +++ b/lustre/include/obd.h @@ -467,28 +467,59 @@ struct echo_client_obd { __u64 ec_unique; }; +struct lov_qos_oss { + struct obd_uuid lqo_uuid; /* ptlrpc's c_remote_uuid */ + struct list_head lqo_oss_list; /* link to lov_qos */ + __u32 lqo_ost_count; /* number of osts on this oss */ + __u64 lqo_bavail; /* total bytes avail on OSS */ + __u64 lqo_penalty; /* current penalty */ + __u64 lqo_penalty_per_obj; /* penalty decrease every obj*/ +}; + +struct ltd_qos { + struct lov_qos_oss *ltq_oss; /* oss info */ + __u64 ltq_penalty; /* current penalty */ + __u64 ltq_penalty_per_obj; /* penalty decrease every obj*/ + __u64 ltq_weight; /* net weighting */ + unsigned int ltq_usable:1; /* usable for striping */ +}; + +struct lov_qos { + struct list_head lq_oss_list; /* list of OSSs that targets use */ + struct rw_semaphore lq_rw_sem; + __u32 lq_active_oss_count; + __u32 *lq_rr_array; /* round-robin optimized list */ + unsigned int lq_rr_size; /* rr array size */ + unsigned int lq_prio_free; /* priority for free space */ + unsigned int lq_dirty:1, /* recalc qos data */ + lq_dirty_rr:1, /* recalc round-robin list */ + lq_same_space:1,/* the ost's all have approx. 
+ the same space avail */ + lq_reset:1; /* zero current penalties */ +}; + struct lov_tgt_desc { - struct obd_uuid uuid; - __u32 ltd_gen; - struct obd_export *ltd_exp; - unsigned int active:1, /* is this target up for requests */ - reap:1; /* should this target be deleted */ - int index; /* index of target array in lov_obd */ - struct list_head qos_bavail_list; /* link entry to lov_obd */ + struct obd_uuid ltd_uuid; + struct obd_export *ltd_exp; + struct ltd_qos ltd_qos; /* qos info per target */ + __u32 ltd_gen; + __u32 ltd_index; /* index in lov_obd->tgts */ + unsigned int ltd_active:1,/* is this target up for requests */ + ltd_reap:1; /* should this target be deleted */ }; struct lov_obd { - struct semaphore lov_lock; - atomic_t refcount; - struct lov_desc desc; - struct obd_connect_data ocd; - int bufsize; - int connects; - int death_row; /* Do we have tgts scheduled to be deleted? - (Make this a linked list?) */ - struct list_head qos_bavail_list; /* tgts list, sorted by available - space, protected by lov_lock */ - struct lov_tgt_desc *tgts; + struct lov_desc desc; + struct lov_tgt_desc *tgts; + struct semaphore lov_lock; + struct obd_connect_data lov_ocd; + struct lov_qos lov_qos; /* qos info per lov */ + atomic_t lov_refcount; + __u32 lov_tgt_count; /* how many OBD's */ + __u32 lov_active_tgt_count; /* how many active */ + int lov_connects; + int lov_death_row;/* tgts scheduled to be deleted */ + int lov_tgt_bufsize; /* size of tgts */ }; struct niobuf_local { diff --git a/lustre/llite/dir.c b/lustre/llite/dir.c index 9c1588b..2d4a78e 100644 --- a/lustre/llite/dir.c +++ b/lustre/llite/dir.c @@ -864,7 +864,7 @@ out_free_memmd: for (i = 0; i < lov->desc.ld_tgt_count; i++) { exp = lov->tgts[i].ltd_exp; - if (!lov->tgts[i].active) + if (!lov->tgts[i].ltd_active) continue; if (exp->exp_obd == obd) { diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index 73c1631..b1d892c 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ 
-1656,7 +1656,7 @@ int ll_obd_statfs(struct inode *inode, void *arg) GOTO(out_statfs, rc = -ENODEV); client_obd = class_exp2obd(lov->tgts[index].ltd_exp); - if (!lov->tgts[index].active) + if (!lov->tgts[index].ltd_active) GOTO(out_uuid, rc = -ENODATA); } diff --git a/lustre/lov/lov_internal.h b/lustre/lov/lov_internal.h index 5829fa9..fecd1b5 100644 --- a/lustre/lov/lov_internal.h +++ b/lustre/lov/lov_internal.h @@ -131,6 +131,8 @@ int lov_stripe_intersects(struct lov_stripe_md *lsm, int stripeno, int lov_stripe_number(struct lov_stripe_md *lsm, obd_off lov_off); /* lov_qos.c */ +int qos_add_tgt(struct obd_device *obd, struct lov_tgt_desc *tgt); +int qos_del_tgt(struct obd_device *obd, struct lov_tgt_desc *tgt); void qos_shrink_lsm(struct lov_request_set *set); int qos_prep_create(struct obd_export *exp, struct lov_request_set *set); void qos_update(struct lov_obd *lov, int idx, struct obd_statfs *osfs); @@ -200,6 +202,8 @@ int lov_fini_cancel_set(struct lov_request_set *set); /* lov_obd.c */ int lov_get_stripecnt(struct lov_obd *lov, int stripe_count); +void lov_getref(struct obd_device *obd); +void lov_putref(struct obd_device *obd); /* lov_log.c */ int lov_llog_init(struct obd_device *obd, struct obd_device *tgt, diff --git a/lustre/lov/lov_log.c b/lustre/lov/lov_log.c index 454b5a6..34fd545 100644 --- a/lustre/lov/lov_log.c +++ b/lustre/lov/lov_log.c @@ -116,11 +116,11 @@ static int lov_llog_origin_connect(struct llog_ctxt *ctxt, int count, struct obd_device *child; struct llog_ctxt *cctxt; - if (!tgt->active) + if (!tgt->ltd_active) continue; child = tgt->ltd_exp->exp_obd; cctxt = llog_get_context(child, ctxt->loc_idx); - if (uuid && !obd_uuid_equals(uuid, &lov->tgts[i].uuid)) + if (uuid && !obd_uuid_equals(uuid, &lov->tgts[i].ltd_uuid)) continue; rc = llog_connect(cctxt, 1, logid, gen, uuid); @@ -154,7 +154,7 @@ static int lov_llog_repl_cancel(struct llog_ctxt *ctxt, struct lov_stripe_md *ls int err; err = llog_cancel(cctxt, NULL, 1, cookies, flags); - if 
(err && lov->tgts[loi->loi_ost_idx].active) { + if (err && lov->tgts[loi->loi_ost_idx].ltd_active) { CERROR("error: objid "LPX64" subobj "LPX64 " on OST idx %d: rc = %d\n", lsm->lsm_object_id, loi->loi_id, loi->loi_ost_idx, err); @@ -196,7 +196,7 @@ int lov_llog_init(struct obd_device *obd, struct obd_device *tgt, LASSERT(lov->desc.ld_tgt_count == count); for (i = 0, ctgt = lov->tgts; i < lov->desc.ld_tgt_count; i++, ctgt++) { struct obd_device *child; - if (!ctgt->active) + if (!ctgt->ltd_active) continue; child = ctgt->ltd_exp->exp_obd; rc = obd_llog_init(child, tgt, 1, logid + i); diff --git a/lustre/lov/lov_obd.c b/lustre/lov/lov_obd.c index cf86943..55cda02 100644 --- a/lustre/lov/lov_obd.c +++ b/lustre/lov/lov_obd.c @@ -54,35 +54,36 @@ /* FIXME add lov_get/putrefs around every access to lov->tgts for on-line non- quiescent ost removal */ /* Keep a refcount of lov->tgt usage to prevent racing with deletion */ -static void lov_getref(struct obd_device *obd) +void lov_getref(struct obd_device *obd) { struct lov_obd *lov = &obd->u.lov; /* nobody gets through here until lov_putref is done */ mutex_down(&lov->lov_lock); - atomic_inc(&lov->refcount); + atomic_inc(&lov->lov_refcount); mutex_up(&lov->lov_lock); return; } static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt); -static void lov_putref(struct obd_device *obd) +void lov_putref(struct obd_device *obd) { struct lov_obd *lov = &obd->u.lov; mutex_down(&lov->lov_lock); /* ok to dec to 0 more than once -- ltd_exp's will be null */ - if (atomic_dec_and_test(&lov->refcount) && lov->death_row) { + if (atomic_dec_and_test(&lov->lov_refcount) && lov->lov_death_row) { struct lov_tgt_desc *tgt; int i; - CDEBUG(D_CONFIG, "destroying %d lov targets\n", lov->death_row); + CDEBUG(D_CONFIG, "destroying %d lov targets\n", + lov->lov_death_row); for (i = 0, tgt = lov->tgts; i < lov->desc.ld_tgt_count; i++, tgt++) { - if (!tgt->reap) + if (!tgt->ltd_reap) continue; /* Disconnect and delete from list */ 
__lov_del_obd(obd, tgt); - lov->death_row--; + lov->lov_death_row--; } } mutex_up(&lov->lov_lock); @@ -93,7 +94,7 @@ static int lov_connect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt, int activate, struct obd_connect_data *data) { struct lov_obd *lov = &obd->u.lov; - struct obd_uuid *tgt_uuid = &tgt->uuid; + struct obd_uuid *tgt_uuid = &tgt->ltd_uuid; struct obd_device *tgt_obd; struct obd_uuid lov_osc_uuid = { "LOV_OSC_UUID" }; struct lustre_handle conn = {0, }; @@ -161,10 +162,14 @@ static int lov_connect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt, RETURN(rc); } - tgt->active = 1; - tgt->reap = 0; + tgt->ltd_active = 1; + tgt->ltd_reap = 0; lov->desc.ld_active_tgt_count++; + rc = qos_add_tgt(obd, tgt); + if (rc) + CERROR("qos_add_tgt failed %d\n", rc); + #ifdef __KERNEL__ lov_proc_dir = lprocfs_srch(obd->obd_proc_entry, "target_obds"); if (lov_proc_dir) { @@ -201,14 +206,14 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd, int rc; ENTRY; - lov->ocd.ocd_connect_flags = OBD_CONNECT_EMPTY; + lov->lov_ocd.ocd_connect_flags = OBD_CONNECT_EMPTY; if (data) - lov->ocd = *data; + lov->lov_ocd = *data; rc = class_connect(conn, obd, cluuid); if (!rc) - lov->connects++; - CDEBUG(D_CONFIG, "connect #%d\n", lov->connects); + lov->lov_connects++; + CDEBUG(D_CONFIG, "connect #%d\n", lov->lov_connects); /* target connects are done in lov_add_target */ @@ -226,6 +231,11 @@ static int lov_disconnect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt) CDEBUG(D_CONFIG, "%s: disconnecting target %s\n", obd->obd_name, osc_obd->obd_name); + if (tgt->ltd_active) { + tgt->ltd_active = 0; + lov->desc.ld_active_tgt_count--; + } + lov_proc_dir = lprocfs_srch(obd->obd_proc_entry, "target_obds"); if (lov_proc_dir) { cfs_proc_dir_entry_t *osc_symlink; @@ -253,17 +263,14 @@ static int lov_disconnect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt) rc = obd_disconnect(tgt->ltd_exp); if (rc) { - if (tgt->active) { + if (tgt->ltd_active) { 
CERROR("Target %s disconnect error %d\n", - tgt->uuid.uuid, rc); + obd_uuid2str(&tgt->ltd_uuid), rc); } rc = 0; } - if (tgt->active) { - tgt->active = 0; - lov->desc.ld_active_tgt_count--; - } + qos_del_tgt(obd, tgt); tgt->ltd_exp = NULL; RETURN(0); @@ -284,10 +291,10 @@ static int lov_disconnect(struct obd_export *exp) goto out; /* Only disconnect the underlying layers on the final disconnect. */ - lov->connects--; - if (lov->connects != 0) { + lov->lov_connects--; + if (lov->lov_connects != 0) { /* why should there be more than 1 connect? */ - CERROR("disconnect #%d\n", lov->connects); + CERROR("disconnect #%d\n", lov->lov_connects); goto out; } @@ -297,7 +304,7 @@ static int lov_disconnect(struct obd_export *exp) for (i = 0, tgt = lov->tgts; i < lov->desc.ld_tgt_count; i++, tgt++) { if (tgt->ltd_exp) { /* Disconnection is the last we know about an obd */ - lov_del_target(obd, &tgt->uuid, i, tgt->ltd_gen); + lov_del_target(obd, &tgt->ltd_uuid, i, tgt->ltd_gen); } } lov_putref(obd); @@ -328,24 +335,25 @@ static int lov_set_osc_active(struct lov_obd *lov, struct obd_uuid *uuid, continue; CDEBUG(D_INFO, "lov idx %d is %s conn "LPX64"\n", - i, tgt->uuid.uuid, tgt->ltd_exp->exp_handle.h_cookie); - if (strncmp(uuid->uuid, tgt->uuid.uuid, sizeof uuid->uuid) == 0) + i, obd_uuid2str(&tgt->ltd_uuid), + tgt->ltd_exp->exp_handle.h_cookie); + if (obd_uuid_equals(uuid, &tgt->ltd_uuid)) break; } if (i == lov->desc.ld_tgt_count) GOTO(out, rc = -EINVAL); - if (tgt->active == activate) { + if (tgt->ltd_active == activate) { CDEBUG(D_INFO, "OSC %s already %sactive!\n", uuid->uuid, activate ? "" : "in"); GOTO(out, rc); } - CDEBUG(D_INFO, "Marking OSC %s %sactive\n", uuid->uuid, + CDEBUG(D_INFO, "Marking OSC %s %sactive\n", obd_uuid2str(uuid), activate ? 
"" : "in"); - tgt->active = activate; + tgt->ltd_active = activate; if (activate) lov->desc.ld_active_tgt_count++; else @@ -401,7 +409,7 @@ static int lov_notify(struct obd_device *obd, struct obd_device *watched, int i; for (i = 0, tgt = lov->tgts; i < lov->desc.ld_tgt_count; i++, tgt++) { - if (obd_uuid_empty(&tgt->uuid)) + if (obd_uuid_empty(&tgt->ltd_uuid)) continue; tgt_obd = class_exp2obd(tgt->ltd_exp); rc = obd_notify_observer(obd, tgt_obd, ev, data); @@ -444,7 +452,7 @@ static int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp, } bufsize = sizeof(struct lov_tgt_desc) * (index + 1); - if (bufsize > lov->bufsize) { + if (bufsize > lov->lov_tgt_bufsize) { OBD_ALLOC(tgt, bufsize); if (tgt == NULL) { CERROR("couldn't allocate %d bytes for new table.\n", @@ -453,35 +461,27 @@ static int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp, } if (lov->tgts) { - int i; - memcpy(tgt, lov->tgts, lov->bufsize); - LASSERT(index == lov->desc.ld_tgt_count); - for (i = 0; i < index; i++) { - INIT_LIST_HEAD(&tgt[i].qos_bavail_list); - list_splice(&lov->tgts[i].qos_bavail_list, - &tgt[i].qos_bavail_list); - } - OBD_FREE(lov->tgts, lov->bufsize); + memcpy(tgt, lov->tgts, lov->lov_tgt_bufsize); + OBD_FREE(lov->tgts, lov->lov_tgt_bufsize); } lov->tgts = tgt; - lov->bufsize = bufsize; + lov->lov_tgt_bufsize = bufsize; CDEBUG(D_CONFIG, "tgts: %p bufsize: %d\n", - lov->tgts, lov->bufsize); + lov->tgts, lov->lov_tgt_bufsize); } tgt = &lov->tgts[index]; - if (!obd_uuid_empty(&tgt->uuid)) { + if (!obd_uuid_empty(&tgt->ltd_uuid)) { CERROR("UUID %s already assigned at LOV target index %d\n", - obd_uuid2str(&tgt->uuid), index); + obd_uuid2str(&tgt->ltd_uuid), index); RETURN(-EEXIST); } - tgt->uuid = *uuidp; + tgt->ltd_uuid = *uuidp; /* XXX - add a sanity check on the generation number. 
*/ tgt->ltd_gen = gen; - tgt->index = index; - INIT_LIST_HEAD(&tgt->qos_bavail_list); + tgt->ltd_index = index; if (index >= lov->desc.ld_tgt_count) lov->desc.ld_tgt_count = index + 1; @@ -497,16 +497,16 @@ static int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp, osc_obd->obd_no_recov = 0; } - if (lov->ocd.ocd_connect_flags != OBD_CONNECT_EMPTY) { + if (lov->lov_ocd.ocd_connect_flags != OBD_CONNECT_EMPTY) { /* Keep the original connect flags pristine */ - OBD_ALLOC(ocd, sizeof(*ocd)); + OBD_ALLOC_PTR(ocd); if (!ocd) RETURN(-ENOMEM); - *ocd = lov->ocd; + *ocd = lov->lov_ocd; } rc = lov_connect_obd(obd, tgt, active, ocd); if (ocd) - OBD_FREE(ocd, sizeof(*ocd)); + OBD_FREE_PTR(ocd); if (rc) GOTO(out, rc); @@ -518,8 +518,8 @@ static int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp, out: if (rc) { CERROR("add failed (%d), deleting %s\n", rc, - (char *)tgt->uuid.uuid); - lov_del_target(obd, &tgt->uuid, index, 0); + obd_uuid2str(&tgt->ltd_uuid)); + lov_del_target(obd, &tgt->ltd_uuid, index, 0); } RETURN(rc); } @@ -542,23 +542,25 @@ static int lov_del_target(struct obd_device *obd, struct obd_uuid *uuidp, tgt = &lov->tgts[index]; - if (obd_uuid_empty(&tgt->uuid)) { + if (obd_uuid_empty(&tgt->ltd_uuid)) { CERROR("LOV target at index %d is not setup.\n", index); RETURN(-EINVAL); } - if (!obd_uuid_equals(uuidp, &tgt->uuid)) { + if (!obd_uuid_equals(uuidp, &tgt->ltd_uuid)) { CERROR("LOV target UUID %s at index %d doesn't match %s.\n", - tgt->uuid.uuid, index, uuidp->uuid); + obd_uuid2str(&tgt->ltd_uuid), index, + obd_uuid2str(uuidp)); RETURN(-EINVAL); } CDEBUG(D_CONFIG, "uuid: %s idx: %d gen: %d exp: %p active: %d\n", - tgt->uuid.uuid, index, tgt->ltd_gen, tgt->ltd_exp, tgt->active); + obd_uuid2str(&tgt->ltd_uuid), index, tgt->ltd_gen, + tgt->ltd_exp, tgt->ltd_active); lov_getref(obd); - tgt->reap = 1; - lov->death_row++; + tgt->ltd_reap = 1; + lov->lov_death_row++; /* we really delete it from lov_putref */ lov_putref(obd); @@ -569,10 +571,10 
@@ static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt) { struct obd_device *osc_obd; - LASSERT(tgt->reap); + LASSERT(tgt->ltd_reap); osc_obd = class_exp2obd(tgt->ltd_exp); - CDEBUG(D_CONFIG, "Removing tgt %s : %s\n", tgt->uuid.uuid, + CDEBUG(D_CONFIG, "Removing tgt %s : %s\n", obd_uuid2str(&tgt->ltd_uuid), osc_obd ? osc_obd->obd_name : ""); if (tgt->ltd_exp) @@ -627,8 +629,7 @@ static int lov_setup(struct obd_device *obd, obd_count len, void *buf) struct lustre_cfg *lcfg = buf; struct lov_desc *desc; struct lov_obd *lov = &obd->u.lov; - struct lov_tgt_desc *tgts; - int count, i; + int count; ENTRY; if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) { @@ -672,22 +673,19 @@ static int lov_setup(struct obd_device *obd, obd_count len, void *buf) /* Allocate space for target list */ if (desc->ld_tgt_count) count = desc->ld_tgt_count; - lov->bufsize = sizeof(struct lov_tgt_desc) * max(count, 1); - OBD_ALLOC(lov->tgts, lov->bufsize); + lov->lov_tgt_bufsize = sizeof(struct lov_tgt_desc) * max(count, 1); + OBD_ALLOC(lov->tgts, lov->lov_tgt_bufsize); if (lov->tgts == NULL) { CERROR("Out of memory\n"); RETURN(-EINVAL); } - for (i = 0, tgts = lov->tgts; i < max(count, 1); i++, tgts++) { - tgts->index = i; - INIT_LIST_HEAD(&tgts->qos_bavail_list); - } desc->ld_active_tgt_count = 0; lov->desc = *desc; sema_init(&lov->lov_lock, 1); - atomic_set(&lov->refcount, 0); - INIT_LIST_HEAD(&lov->qos_bavail_list); + atomic_set(&lov->lov_refcount, 0); + INIT_LIST_HEAD(&lov->lov_qos.lq_oss_list); + init_rwsem(&lov->lov_qos.lq_rw_sem); lprocfs_init_vars(lov, &lvars); lprocfs_obd_setup(obd, lvars.obd_vars); @@ -717,7 +715,7 @@ static int lov_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage) struct lov_obd *lov = &obd->u.lov; int i; for (i = 0; i < lov->desc.ld_tgt_count; i++) { - if (!lov->tgts[i].active) + if (!lov->tgts[i].ltd_active) continue; obd_precleanup(class_exp2obd(lov->tgts[i].ltd_exp), OBD_CLEANUP_EARLY); @@ -749,16 +747,20 @@ static int lov_cleanup(struct 
obd_device *obd) i < lov->desc.ld_tgt_count; i++, tgt++) { /* We should never get here - these should have been removed in the disconnect. */ - if (!obd_uuid_empty(&tgt->uuid)) { + if (!obd_uuid_empty(&tgt->ltd_uuid)) { CERROR("lov tgt %d not cleaned!" " deathrow=%d, lovrc=%d\n", - i, lov->death_row, - atomic_read(&lov->refcount)); - lov_del_target(obd, &tgt->uuid, i, 0); + i, lov->lov_death_row, + atomic_read(&lov->lov_refcount)); + lov_del_target(obd, &tgt->ltd_uuid, i, 0); } } - OBD_FREE(lov->tgts, lov->bufsize); + OBD_FREE(lov->tgts, lov->lov_tgt_bufsize); } + + if (lov->lov_qos.lq_rr_size) + OBD_FREE(lov->lov_qos.lq_rr_array, lov->lov_qos.lq_rr_size); + RETURN(0); } @@ -881,15 +883,17 @@ static int lov_clear_orphans(struct obd_export *export, struct obdo *src_oa, /* if called for a specific target, we don't care if it is not active. */ - if (lov->tgts[i].active == 0 && ost_uuid == NULL) { + if (lov->tgts[i].ltd_active == 0 && ost_uuid == NULL) { CDEBUG(D_HA, "lov idx %d inactive\n", i); continue; } - if (ost_uuid && !obd_uuid_equals(ost_uuid, &lov->tgts[i].uuid)) + if (ost_uuid && + !obd_uuid_equals(ost_uuid, &lov->tgts[i].ltd_uuid)) continue; - CDEBUG(D_CONFIG,"Clear orphans for %d:%s\n", i, ost_uuid->uuid); + CDEBUG(D_CONFIG,"Clear orphans for %d:%s\n", i, + obd_uuid2str(ost_uuid)); memcpy(tmp_oa, src_oa, sizeof(*tmp_oa)); @@ -1241,7 +1245,7 @@ static int lov_setattr_async(struct obd_export *exp, struct obdo *src_oa, loi = lsm->lsm_oinfo; for (i = 0; i < lsm->lsm_stripe_count; i++, loi++) { - if (lov->tgts[loi->loi_ost_idx].active == 0) { + if (lov->tgts[loi->loi_ost_idx].ltd_active == 0) { CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); goto next; } @@ -1370,7 +1374,7 @@ static int lov_brw_check(struct lov_obd *lov, struct obdo *oa, &start, &end)) continue; - if (lov->tgts[ost].active == 0) { + if (lov->tgts[ost].ltd_active == 0) { CDEBUG(D_HA, "lov idx %d inactive\n", ost); return -EIO; } @@ -1672,7 +1676,7 @@ static int 
lov_trigger_group_io(struct obd_export *exp, loi = lsm->lsm_oinfo; for (i = 0; i < lsm->lsm_stripe_count; i++, loi++) { - if (lov->tgts[loi->loi_ost_idx].active == 0) { + if (lov->tgts[loi->loi_ost_idx].ltd_active == 0) { CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); continue; } @@ -1903,14 +1907,14 @@ static int lov_cancel_unused(struct obd_export *exp, struct lov_stripe_md submd; int err; - if (lov->tgts[loi->loi_ost_idx].active == 0) + if (lov->tgts[loi->loi_ost_idx].ltd_active == 0) CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); submd.lsm_object_id = loi->loi_id; submd.lsm_stripe_count = 0; err = obd_cancel_unused(lov->tgts[loi->loi_ost_idx].ltd_exp, &submd, flags, opaque); - if (err && lov->tgts[loi->loi_ost_idx].active) { + if (err && lov->tgts[loi->loi_ost_idx].ltd_active) { CERROR("error: cancel unused objid "LPX64" subobj "LPX64 " on OST idx %d: rc = %d\n", lsm->lsm_object_id, loi->loi_id, loi->loi_ost_idx, err); @@ -1938,7 +1942,7 @@ static int lov_join_lru(struct obd_export *exp, struct lov_stripe_md submd; int rc = 0; - if (lov->tgts[loi->loi_ost_idx].active == 0) + if (lov->tgts[loi->loi_ost_idx].ltd_active == 0) CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); submd.lsm_object_id = loi->loi_id; @@ -1980,7 +1984,7 @@ static int lov_statfs(struct obd_device *obd, struct obd_statfs *osfs, /* We only get block data from the OBD */ for (i = 0; i < lov->desc.ld_tgt_count; i++) { int err; - if (!lov->tgts[i].active) { + if (!lov->tgts[i].ltd_active) { CDEBUG(D_HA, "lov idx %d inactive\n", i); continue; } @@ -1988,7 +1992,7 @@ static int lov_statfs(struct obd_device *obd, struct obd_statfs *osfs, err = obd_statfs(class_exp2obd(lov->tgts[i].ltd_exp), &lov_sfs, max_age); if (err) { - if (lov->tgts[i].active && !rc) + if (lov->tgts[i].ltd_active && !rc) rc = err; continue; } @@ -2090,7 +2094,7 @@ static int lov_iocontrol(unsigned int cmd, struct obd_export *exp, int len, tgtdesc = lov->tgts; /* the uuid will be empty for deleted OSTs 
*/ for (i = 0; i < count; i++, uuidp++, genp++, tgtdesc++) { - obd_str2uuid(uuidp, tgtdesc->uuid.uuid); + obd_str2uuid(uuidp, tgtdesc->ltd_uuid.uuid); *genp = tgtdesc->ltd_gen; } @@ -2128,11 +2132,11 @@ static int lov_iocontrol(unsigned int cmd, struct obd_export *exp, int len, if (err == -ENODATA && cmd == OBD_IOC_POLL_QUOTACHECK) { RETURN(err); } else if (err) { - if (lov->tgts[i].active) { + if (lov->tgts[i].ltd_active) { CERROR("error: iocontrol OSC %s on OST " "idx %d cmd %x: err = %d\n", - lov->tgts[i].uuid.uuid, i, - cmd, err); + obd_uuid2str(&lov->tgts[i].ltd_uuid), + i, cmd, err); if (!rc) rc = err; } @@ -2199,7 +2203,7 @@ static int lov_get_info(struct obd_export *exp, __u32 keylen, obd_id *ids = val; int size = sizeof(obd_id); for (i = 0; i < lov->desc.ld_tgt_count; i++) { - if (!lov->tgts[i].active) + if (!lov->tgts[i].ltd_active) continue; rc = obd_get_info(lov->tgts[i].ltd_exp, keylen, key, &size, &(ids[i])); @@ -2264,7 +2268,7 @@ static int lov_set_info_async(struct obd_export *exp, obd_count keylen, if (KEY_IS("evict_by_nid")) { for (i = 0; i < lov->desc.ld_tgt_count; i++) { /* OST was disconnected or is inactive */ - if (!lov->tgts[i].ltd_exp || !lov->tgts[i].active) + if (!lov->tgts[i].ltd_exp || !lov->tgts[i].ltd_active) continue; err = obd_set_info_async(lov->tgts[i].ltd_exp, keylen, @@ -2283,14 +2287,14 @@ static int lov_set_info_async(struct obd_export *exp, obd_count keylen, } for (i = 0; i < lov->desc.ld_tgt_count; i++) { - if (val && !obd_uuid_equals(val, &lov->tgts[i].uuid)) + if (val && !obd_uuid_equals(val, &lov->tgts[i].ltd_uuid)) continue; /* OST was disconnected */ if (!lov->tgts[i].ltd_exp) continue; - if (!val && !lov->tgts[i].active) + if (!val && !lov->tgts[i].ltd_active) continue; err = obd_set_info_async(lov->tgts[i].ltd_exp, diff --git a/lustre/lov/lov_qos.c b/lustre/lov/lov_qos.c index 84be134..dba9d2e 100644 --- a/lustre/lov/lov_qos.c +++ b/lustre/lov/lov_qos.c @@ -35,9 +35,341 @@ #include #include - #include "lov_internal.h" 
+ +/* #define QOS_DEBUG 1 */ +#define D_QOS D_OTHER + +#define TGT_BAVAIL(tgt) (tgt->ltd_exp->exp_obd->obd_osfs.os_bavail * \ + tgt->ltd_exp->exp_obd->obd_osfs.os_bsize) +#define TGT_FFREE(tgt) (tgt->ltd_exp->exp_obd->obd_osfs.os_ffree) + + +int qos_add_tgt(struct obd_device *obd, struct lov_tgt_desc *tgt) +{ + struct lov_obd *lov = &obd->u.lov; + struct lov_qos_oss *oss, *temposs; + int rc = 0, found = 0; + ENTRY; + + if (!obd->obd_observer) + /* QOS is only on MDT, not clients */ + RETURN(0); + + if (!tgt->ltd_exp || !tgt->ltd_exp->exp_connection) { + CERROR("Missing connection\n"); + RETURN(-ENOTCONN); + } + + mutex_down(&lov->lov_lock); + list_for_each_entry(oss, &lov->lov_qos.lq_oss_list, lqo_oss_list) { + if (obd_uuid_equals(&oss->lqo_uuid, + &tgt->ltd_exp->exp_connection->c_remote_uuid)) { + found++; + break; + } + } + + if (!found) { + OBD_ALLOC_PTR(oss); + if (!oss) + GOTO(out, rc = -ENOMEM); + memcpy(&oss->lqo_uuid, + &tgt->ltd_exp->exp_connection->c_remote_uuid, + sizeof(oss->lqo_uuid)); + } else { + /* Assume we have to move this one */ + list_del(&oss->lqo_oss_list); + } + + oss->lqo_ost_count++; + tgt->ltd_qos.ltq_oss = oss; + + /* Add sorted by # of OSTs. Find the first entry that we're + bigger than... */ + list_for_each_entry(temposs, &lov->lov_qos.lq_oss_list, lqo_oss_list) { + if (oss->lqo_ost_count > temposs->lqo_ost_count) + break; + } + /* ...and add before it. If we're the first or smallest, temposs + points to the list head, and we add to the end. 
*/ + list_add_tail(&oss->lqo_oss_list, &temposs->lqo_oss_list); + + lov->lov_qos.lq_dirty = 1; + lov->lov_qos.lq_dirty_rr = 1; + + CDEBUG(D_QOS, "add tgt %s to OSS %s (#%d)\n", + obd_uuid2str(&tgt->ltd_uuid), obd_uuid2str(&oss->lqo_uuid), + oss->lqo_ost_count); + +out: + mutex_up(&lov->lov_lock); + RETURN(rc); +} + +int qos_del_tgt(struct obd_device *obd, struct lov_tgt_desc *tgt) +{ + struct lov_obd *lov = &obd->u.lov; + struct lov_qos_oss *oss; + int rc = 0; + ENTRY; + + oss = tgt->ltd_qos.ltq_oss; + if (!oss) + RETURN(-ENOENT); + + oss->lqo_ost_count--; + if (oss->lqo_ost_count == 0) { + CDEBUG(D_QOS, "removing OSS %s\n", + obd_uuid2str(&oss->lqo_uuid)); + list_del(&oss->lqo_oss_list); + OBD_FREE_PTR(oss); + } + + lov->lov_qos.lq_dirty = 1; + lov->lov_qos.lq_dirty_rr = 1; + + RETURN(rc); +} + +/* Recalculate per-object penalties for OSSs and OSTs, + depends on size of each ost in an oss */ +static int qos_calc_ppo(struct obd_device *obd) +{ + struct lov_obd *lov = &obd->u.lov; + struct lov_qos_oss *oss; + struct lov_tgt_desc *tgt; + __u64 ba_max, ba_min, temp; + __u32 num_active; + int rc, i, prio_wide; + ENTRY; + + if (!lov->lov_qos.lq_dirty) + GOTO(out, rc = 0); + + num_active = lov->desc.ld_active_tgt_count - 1; + if (num_active < 1) + GOTO(out, rc = -EAGAIN); + + /* find bavail on each OSS */ + list_for_each_entry(oss, &lov->lov_qos.lq_oss_list, lqo_oss_list) { + oss->lqo_bavail = 0; + } + lov->lov_qos.lq_active_oss_count = 0; + + /* How badly user wants to select osts "widely" (not recently chosen + and not on recent oss's). As opposed to "freely" (free space + avail.) 0-256. 
*/ + prio_wide = 256 - lov->lov_qos.lq_prio_free; + + ba_min = (__u64)(-1); + ba_max = 0; + /* Calculate OST penalty per object */ + for (i = 0, tgt = lov->tgts; i < lov->desc.ld_tgt_count; i++, tgt++) { + if (!tgt->ltd_active) + continue; + temp = TGT_BAVAIL(tgt); + if (!temp) + continue; + ba_min = min(temp, ba_min); + ba_max = max(temp, ba_max); + + /* Count the number of usable OSS's */ + if (tgt->ltd_qos.ltq_oss->lqo_bavail == 0) + lov->lov_qos.lq_active_oss_count++; + tgt->ltd_qos.ltq_oss->lqo_bavail += temp; + + /* per-OST penalty is prio * TGT_bavail / (num_ost - 1) / 2 */ + temp >>= 1; + do_div(temp, num_active); + tgt->ltd_qos.ltq_penalty_per_obj = (temp * prio_wide) >> 8; + + if (lov->lov_qos.lq_reset == 0) + tgt->ltd_qos.ltq_penalty = 0; + } + + num_active = lov->lov_qos.lq_active_oss_count - 1; + if (num_active < 1) { + /* If there's only 1 OSS, we can't penalize it, so instead + we have to double the OST penalty */ + num_active = 1; + for (i = 0, tgt = lov->tgts; i < lov->desc.ld_tgt_count; + i++, tgt++) + tgt->ltd_qos.ltq_penalty_per_obj <<= 1; + } + + /* Per-OSS penalty is prio * oss_avail / oss_osts / (num_oss - 1) / 2 */ + list_for_each_entry(oss, &lov->lov_qos.lq_oss_list, lqo_oss_list) { + temp = oss->lqo_bavail >> 1; + do_div(temp, oss->lqo_ost_count * num_active); + oss->lqo_penalty_per_obj = (temp * prio_wide) >> 8; + if (lov->lov_qos.lq_reset == 0) + oss->lqo_penalty = 0; + } + + lov->lov_qos.lq_dirty = 0; + lov->lov_qos.lq_reset = 0; + + /* If each ost has almost same free space, + * do rr allocation for better creation performance */ + lov->lov_qos.lq_same_space = 0; + temp = ba_max - ba_min; + ba_min = (ba_min * 51) >> 8; /* 51/256 = .20 */ + if (temp < ba_min) { + /* Difference is less than 20% */ + lov->lov_qos.lq_same_space = 1; + /* Reset weights for the next time we enter qos mode */ + lov->lov_qos.lq_reset = 0; + } + rc = 0; + +out: + if (!rc && lov->lov_qos.lq_same_space) + RETURN(-EAGAIN); + RETURN(rc); +} + +static int 
qos_calc_weight(struct lov_tgt_desc *tgt) +{ + __u64 temp, temp2; + + /* Final ost weight = TGT_BAVAIL - ost_penalty - oss_penalty */ + temp = TGT_BAVAIL(tgt); + temp2 = tgt->ltd_qos.ltq_penalty + + tgt->ltd_qos.ltq_oss->lqo_penalty; + if (temp < temp2) + tgt->ltd_qos.ltq_weight = 0; + else + tgt->ltd_qos.ltq_weight = temp - temp2; + return 0; +} + +/* We just used this index for a stripe; adjust everyone's weights */ +static int qos_used(struct lov_obd *lov, __u32 index, __u64 *total_wt) +{ + struct lov_qos_oss *oss; + struct lov_tgt_desc *tgt = &lov->tgts[index]; + int i; + ENTRY; + + /* Don't allocate from this stripe anymore, until the next alloc_qos */ + tgt->ltd_qos.ltq_usable = 0; + + oss = tgt->ltd_qos.ltq_oss; + + /* Decay old penalty by half (we're adding max penalty, and don't + want it to run away.) */ + tgt->ltd_qos.ltq_penalty >>= 1; + oss->lqo_penalty >>= 1; + + /* Set max penalties for this OST and OSS */ + tgt->ltd_qos.ltq_penalty += tgt->ltd_qos.ltq_penalty_per_obj * + lov->desc.ld_active_tgt_count; + oss->lqo_penalty += oss->lqo_penalty_per_obj * + lov->lov_qos.lq_active_oss_count; + + /* Decrease all OSS penalties */ + list_for_each_entry(oss, &lov->lov_qos.lq_oss_list, lqo_oss_list) { + if (oss->lqo_penalty < oss->lqo_penalty_per_obj) + oss->lqo_penalty = 0; + else + oss->lqo_penalty -= oss->lqo_penalty_per_obj; + } + + *total_wt = 0; + /* Decrease all OST penalties */ + for (i = 0, tgt = lov->tgts; i < lov->desc.ld_tgt_count; i++, tgt++) { + if (!tgt->ltd_active) + continue; + if (tgt->ltd_qos.ltq_penalty < tgt->ltd_qos.ltq_penalty_per_obj) + tgt->ltd_qos.ltq_penalty = 0; + else + tgt->ltd_qos.ltq_penalty -= + tgt->ltd_qos.ltq_penalty_per_obj; + + qos_calc_weight(tgt); + + /* Recalc the total weight of usable osts */ + if (tgt->ltd_qos.ltq_usable) + *total_wt += tgt->ltd_qos.ltq_weight; + +#ifdef QOS_DEBUG + CDEBUG(D_QOS, "recalc tgt %d avail="LPU64 + " ostppo="LPU64" ostp="LPU64" ossppo="LPU64 + " ossp="LPU64" wt="LPU64"\n", + i, 
TGT_BAVAIL(tgt), + tgt->ltd_qos.ltq_penalty_per_obj, + tgt->ltd_qos.ltq_penalty, + tgt->ltd_qos.ltq_oss->lqo_penalty_per_obj, + tgt->ltd_qos.ltq_oss->lqo_penalty, + tgt->ltd_qos.ltq_weight); +#endif + } + + RETURN(0); +} + +#define LOV_QOS_EMPTY ((__u32)-1) +/* compute optimal round-robin order, based on OSTs per OSS */ +static int qos_calc_rr(struct lov_obd *lov) +{ + struct lov_qos_oss *oss; + struct lov_tgt_desc *tgt; + unsigned ost_count; + int i; + ENTRY; + + if (!lov->lov_qos.lq_dirty_rr) { + LASSERT(lov->lov_qos.lq_rr_size); + RETURN(0); + } + + down_write(&lov->lov_qos.lq_rw_sem); + ost_count = lov->desc.ld_tgt_count; + + if (lov->lov_qos.lq_rr_size) + OBD_FREE(lov->lov_qos.lq_rr_array, lov->lov_qos.lq_rr_size); + lov->lov_qos.lq_rr_size = ost_count * + sizeof(lov->lov_qos.lq_rr_array[0]); + OBD_ALLOC(lov->lov_qos.lq_rr_array, lov->lov_qos.lq_rr_size); + if (!lov->lov_qos.lq_rr_array) { + lov->lov_qos.lq_rr_size = 0; + up_write(&lov->lov_qos.lq_rw_sem); + RETURN(-ENOMEM); + } + + for (i = 0; i < ost_count; i++) + lov->lov_qos.lq_rr_array[i] = LOV_QOS_EMPTY; + + /* Place all the OSTs from 1 OSS at the same time. 
*/ + list_for_each_entry(oss, &lov->lov_qos.lq_oss_list, lqo_oss_list) { + int j = 0; + for (i = 0, tgt = lov->tgts; i < ost_count; i++, tgt++) { + if (tgt->ltd_qos.ltq_oss == oss) { + /* Evenly space these OSTs across arrayspace */ + int next = j * ost_count / oss->lqo_ost_count; + while (lov->lov_qos.lq_rr_array[next] != LOV_QOS_EMPTY) + next = (next + 1) % ost_count; + lov->lov_qos.lq_rr_array[next] = i; + j++; + } + } + LASSERT(j == oss->lqo_ost_count); + } + + lov->lov_qos.lq_dirty_rr = 0; + + up_write(&lov->lov_qos.lq_rw_sem); + +#ifdef QOS_DEBUG + for (i = 0; i < ost_count; i++) + CDEBUG(D_QOS, "idx %d rr %d\n", i, lov->lov_qos.lq_rr_array[i]); +#endif + RETURN(0); +} + + void qos_shrink_lsm(struct lov_request_set *set) { struct lov_stripe_md *lsm = set->set_md, *lsm_new; @@ -86,10 +418,8 @@ int qos_remedy_create(struct lov_request_set *set, struct lov_request *req) ost_idx = (req->rq_idx + lsm->lsm_stripe_count) % ost_count; for (i = 0; i < ost_count; i++, ost_idx = (ost_idx + 1) % ost_count) { - if (lov->tgts[ost_idx].active == 0) { - CDEBUG(D_HA, "lov idx %d inactive\n", ost_idx); + if (lov->tgts[ost_idx].ltd_active == 0) continue; - } /* check if objects has been created on this ost */ for (stripe = 0; stripe < lsm->lsm_stripe_count; stripe++) { if (stripe == req->rq_stripe) @@ -119,7 +449,11 @@ static int alloc_rr(struct lov_obd *lov, int *idx_arr, int *stripe_cnt) unsigned ost_active_count = lov->desc.ld_active_tgt_count; int i, *idx_pos = idx_arr; ENTRY; - + + i = qos_calc_rr(lov); + if (i) + RETURN(i); + if (--ost_start_count <= 0) { ost_start_idx = ll_rand(); ost_start_count = @@ -132,20 +466,23 @@ static int alloc_rr(struct lov_obd *lov, int *idx_arr, int *stripe_cnt) } ost_idx = ost_start_idx % ost_count; + down_read(&lov->lov_qos.lq_rw_sem); for (i = 0; i < ost_count; i++, ost_idx = (ost_idx + 1) % ost_count) { ++ost_start_idx; - if (lov->tgts[ost_idx].active == 0) { - CDEBUG(D_HA, "lov idx %d inactive\n", ost_idx); + if 
(lov->tgts[ost_idx].ltd_active == 0) continue; - } - *idx_pos = ost_idx; + *idx_pos = lov->lov_qos.lq_rr_array[ost_idx]; idx_pos++; /* got enough ost */ - if (idx_pos - idx_arr == *stripe_cnt) + if (idx_pos - idx_arr == *stripe_cnt) { + up_read(&lov->lov_qos.lq_rw_sem); RETURN(0); + } } + up_read(&lov->lov_qos.lq_rw_sem); + *stripe_cnt = idx_pos - idx_arr; RETURN(0); } @@ -160,8 +497,7 @@ static int alloc_specific(struct lov_obd *lov, struct lov_stripe_md *lsm, ost_idx = lsm->lsm_oinfo[0].loi_ost_idx; for (i = 0; i < ost_count; i++, ost_idx = (ost_idx + 1) % ost_count) { - if (lov->tgts[ost_idx].active == 0) { - CDEBUG(D_HA, "lov idx %d inactive\n", ost_idx); + if (lov->tgts[ost_idx].ltd_active == 0) { continue; } *idx_pos = ost_idx; @@ -181,165 +517,126 @@ static int alloc_specific(struct lov_obd *lov, struct lov_stripe_md *lsm, RETURN(-EFBIG); } -/* free space OST must have to be used for object allocation. */ -#define QOS_MIN (lov->desc.ld_qos_threshold << 20) - -#define TGT_BAVAIL(tgt) (tgt->ltd_exp->exp_obd->obd_osfs.os_bavail * \ - tgt->ltd_exp->exp_obd->obd_osfs.os_bsize) -#define TGT_FFREE(tgt) (tgt->ltd_exp->exp_obd->obd_osfs.os_ffree) - -/* alloc objects on osts with free space weighted algorithm */ +/* Alloc objects on osts with optimization based on: + - free space + - network resources (shared OSS's) +*/ static int alloc_qos(struct obd_export *exp, int *idx_arr, int *stripe_cnt) { struct lov_obd *lov = &exp->exp_obd->u.lov; - unsigned ost_count = lov->desc.ld_tgt_count; - __u64 cur_bavail, rand, *availspace, total_bavail = 0; - int *indexes, nfound, good_osts, i, warn = 0, rc = 0; struct lov_tgt_desc *tgt; - int shift, require_stripes = *stripe_cnt; static time_t last_warn = 0; time_t now = cfs_time_current_sec(); + __u64 cur_weight, temp, rand, bavail, total_bavail, total_weight = 0; + __u32 ost_count; + int nfound, good_osts, i, warn = 0, rc = 0; ENTRY; - availspace = NULL; - indexes = NULL; - OBD_ALLOC(availspace, sizeof(__u64) * ost_count); - 
OBD_ALLOC(indexes, sizeof(int) * require_stripes); - if (!availspace || !indexes) - GOTO(out_free, rc = -EAGAIN); - - mutex_down(&lov->lov_lock); - - /* if free space is below some threshold, just go - * to do round-robin allocation */ - total_bavail = (exp->exp_obd->obd_osfs.os_bavail * \ - exp->exp_obd->obd_osfs.os_bsize); - if (ost_count < 2 || total_bavail <= QOS_MIN) { - mutex_up(&lov->lov_lock); - GOTO(out_free, rc = -EAGAIN); - } + lov_getref(exp->exp_obd); + down_write(&lov->lov_qos.lq_rw_sem); - /* if each ost has almost same free space, go to - * do rr allocation for better creation performance */ - if (!list_empty(&lov->qos_bavail_list)) { - __u64 max, min, val; - tgt = list_entry(lov->qos_bavail_list.next, - struct lov_tgt_desc, qos_bavail_list); - max = TGT_BAVAIL(tgt); - tgt = list_entry(lov->qos_bavail_list.prev, - struct lov_tgt_desc, qos_bavail_list); - min = TGT_BAVAIL(tgt); - - val = (max >= min) ? (max - min) : (min - max); - min = (min * 13) >> 8; /* less than 5% of gap */ - - if (val < min) { - mutex_up(&lov->lov_lock); - GOTO(out_free, rc = -EAGAIN); - } - } else { - mutex_up(&lov->lov_lock); - GOTO(out_free, rc = -EAGAIN); - } + ost_count = lov->desc.ld_tgt_count; + + if (lov->desc.ld_active_tgt_count < 2) + GOTO(out, rc = -EAGAIN); + + rc = qos_calc_ppo(exp->exp_obd); + if (rc) + GOTO(out, rc); total_bavail = 0; good_osts = 0; - /* warn zero available space/inode every 30 min */ + /* Warn users about zero available space/inode every 30 min */ if (cfs_time_sub(now, last_warn) > 60 * 30) warn = 1; - /* Find all the OSTs big enough to be stripe candidates */ - list_for_each_entry(tgt, &lov->qos_bavail_list, qos_bavail_list) { - if (!tgt->active) + /* Find all the OSTs that are valid stripe candidates */ + for (i = 0, tgt = lov->tgts; i < ost_count; i++, tgt++) { + if (!tgt->ltd_active) continue; - if (!TGT_BAVAIL(tgt)) { + bavail = TGT_BAVAIL(tgt); + if (!bavail) { if (warn) { - CWARN("no free space on %s\n", - tgt->uuid.uuid); + CDEBUG(D_QOS, 
"no free space on %s\n", + obd_uuid2str(&tgt->ltd_uuid)); last_warn = now; } continue; } if (!TGT_FFREE(tgt)) { if (warn) { - CWARN("no free inodes on %s\n", - tgt->uuid.uuid); + CDEBUG(D_QOS, "no free inodes on %s\n", + obd_uuid2str(&tgt->ltd_uuid)); last_warn = now; } continue; } - /* We can stop if we have enough good osts and our osts - are getting too small */ - if ((TGT_BAVAIL(tgt) <= QOS_MIN) && (good_osts >= *stripe_cnt)) - break; - availspace[good_osts] = TGT_BAVAIL(tgt); - indexes[good_osts] = tgt->index; - total_bavail += availspace[good_osts]; + + tgt->ltd_qos.ltq_usable = 1; + qos_calc_weight(tgt); + total_bavail += bavail; + total_weight += tgt->ltd_qos.ltq_weight; + good_osts++; } - mutex_up(&lov->lov_lock); - if (!total_bavail) - GOTO(out_free, rc = -ENOSPC); + GOTO(out, rc = -ENOSPC); /* if we don't have enough good OSTs, we reduce the stripe count. */ if (good_osts < *stripe_cnt) *stripe_cnt = good_osts; if (!*stripe_cnt) - GOTO(out_free, rc = -EAGAIN); + GOTO(out, rc = -EAGAIN); - /* The point of all this shift and rand is to choose a 64-bit - random number between 0 and total_bavail. Apparently '%' doesn't - work for 64bit numbers. */ - nfound = shift = 0; - while ((total_bavail >> shift) > 0) - shift++; - shift++; - /* Find enough OSTs with free space weighted random allocation */ + /* Find enough OSTs with weighted random allocation. */ + nfound = 0; while (nfound < *stripe_cnt) { - cur_bavail = 0; - - /* If the total storage left is < 4GB, don't use random order, - store in biggest OST first. (Low storage situation.) - Otherwise, choose a 64bit random number... */ - rand = (shift < 32 ? 0ULL : (__u64)ll_rand() << 32) | ll_rand(); - /* ... mask everything above shift... */ - if (shift < 64) - rand &= ((1ULL << shift) - 1); - /* ... and this while should execute at most once... */ - while (rand > total_bavail) - rand -= total_bavail; - /* ... leaving us a 64bit number between 0 and total_bavail. */ - - /* Try to fit in bigger OSTs first. 
On average, this will - fill more toward the front of the OST array */ - for (i = 0; i < good_osts; i++) { - cur_bavail += availspace[i]; - if (cur_bavail >= rand) { - total_bavail -= availspace[i]; - availspace[i] = 0; - idx_arr[nfound] = indexes[i]; - nfound++; + cur_weight = 0; + rc = -ENODEV; + + if (total_weight) { + /* If total_weight > 32-bit, make a 64-bit random # */ + temp = (total_weight & 0xffffffff00000000ULL ? + (__u64)ll_rand() << 32 : 0ULL) | ll_rand(); + /* Random number between 0 and total_weight */ + rand = do_div(temp, total_weight); + } else { + rand = 0; + } + + /* On average, this will hit larger-weighted osts more often. + 0-weight osts will always get used last (only when rand=0).*/ + for (i = 0, tgt = lov->tgts; i < ost_count; i++, tgt++) { + if (!tgt->ltd_qos.ltq_usable) + continue; + cur_weight += tgt->ltd_qos.ltq_weight; + if (cur_weight >= rand) { +#ifdef QOS_DEBUG + CDEBUG(D_QOS, "assigned stripe=%d to idx=%d\n", + nfound, i); +#endif + idx_arr[nfound++] = i; + qos_used(lov, i, &total_weight); + rc = 0; break; } } /* should never satisfy below condition */ - if (cur_bavail == 0) + if (rc) { + CERROR("Didn't find any OSTs?\n"); break; + } } LASSERT(nfound == *stripe_cnt); -out_free: - if (availspace) - OBD_FREE(availspace, sizeof(__u64) * ost_count); - if (indexes) - OBD_FREE(indexes, sizeof(int) * require_stripes); - if (rc != -EAGAIN) - /* rc == 0 or err */ - RETURN(rc); - - rc = alloc_rr(lov, idx_arr, stripe_cnt); +out: + up_write(&lov->lov_qos.lq_rw_sem); + + if (rc == -EAGAIN) + rc = alloc_rr(lov, idx_arr, stripe_cnt); + + lov_putref(exp->exp_obd); RETURN(rc); } @@ -403,19 +700,18 @@ int qos_prep_create(struct obd_export *exp, struct lov_request_set *set) * This may mean we use more than the default # of stripes. 
*/ if (src_oa->o_valid & OBD_MD_FLSIZE) { struct lov_tgt_desc *tgt; + obd_size min_bavail = (obd_size)-1; - /* Find the smallest number of stripes we can use + /* Find a small number of stripes we can use (up to # of active osts). */ stripes = 1; mutex_down(&lov->lov_lock); - list_for_each_entry(tgt, &lov->qos_bavail_list, - qos_bavail_list) { - if (!tgt->active) + for (i = 0, tgt = lov->tgts; i < lov->desc.ld_tgt_count; + i++, tgt++) { + if (!tgt->ltd_active) continue; - /* All earlier tgts have at least this many - bytes available also, since our list is - sorted by size */ - if (TGT_BAVAIL(tgt) * stripes > src_oa->o_size) + min_bavail = min(min_bavail, TGT_BAVAIL(tgt)); + if (min_bavail * stripes > src_oa->o_size) break; stripes++; } @@ -536,18 +832,7 @@ out_err: void qos_update(struct lov_obd *lov, int idx, struct obd_statfs *osfs) { - struct lov_tgt_desc *tgt = &lov->tgts[idx]; - __u64 bavail; ENTRY; - - bavail = osfs->os_bavail * osfs->os_bsize; - if (!bavail) - CWARN("ost %d has zero avail space!\n", idx); - - CDEBUG(D_OTHER, "QOS: bfree now "LPU64"\n", bavail); - - mutex_down(&lov->lov_lock); - list_adjust(tgt, lov, qos_bavail_list, TGT_BAVAIL); - mutex_up(&lov->lov_lock); + lov->lov_qos.lq_dirty = 1; } diff --git a/lustre/lov/lov_request.c b/lustre/lov/lov_request.c index b39bb1a..65800ab 100644 --- a/lustre/lov/lov_request.c +++ b/lustre/lov/lov_request.c @@ -97,7 +97,7 @@ int lov_update_common_set(struct lov_request_set *set, lov_update_set(set, req, rc); /* grace error on inactive ost */ - if (rc && !lov->tgts[req->rq_idx].active) + if (rc && !lov->tgts[req->rq_idx].ltd_active) rc = 0; /* FIXME in raid1 regime, should return 0 */ @@ -165,7 +165,7 @@ int lov_update_enqueue_set(struct lov_request_set *set, struct lov_obd *lov = &exp->exp_obd->u.lov; memset(lov_lockhp, 0, sizeof(*lov_lockhp)); - if (lov->tgts[req->rq_idx].active) { + if (lov->tgts[req->rq_idx].ltd_active) { CERROR("error: enqueue objid "LPX64" subobj " LPX64" on OST idx %d: rc = %d\n", 
set->set_md->lsm_object_id, loi->loi_id, @@ -203,7 +203,7 @@ static int enqueue_done(struct lov_request_set *set, __u32 mode) rc = obd_cancel(lov->tgts[req->rq_idx].ltd_exp, req->rq_md, mode, lov_lockhp); - if (rc && lov->tgts[req->rq_idx].active) + if (rc && lov->tgts[req->rq_idx].ltd_active) CERROR("cancelling obdjid "LPX64" on OST " "idx %d error: rc = %d\n", req->rq_md->lsm_object_id, req->rq_idx, rc); @@ -263,7 +263,7 @@ int lov_prep_enqueue_set(struct obd_export *exp, struct lov_stripe_md *lsm, policy->l_extent.end, &start, &end)) continue; - if (lov->tgts[loi->loi_ost_idx].active == 0) { + if (lov->tgts[loi->loi_ost_idx].ltd_active == 0) { CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); continue; } @@ -372,7 +372,7 @@ int lov_prep_match_set(struct obd_export *exp, struct lov_stripe_md *lsm, continue; /* FIXME raid1 should grace this error */ - if (lov->tgts[loi->loi_ost_idx].active == 0) { + if (lov->tgts[loi->loi_ost_idx].ltd_active == 0) { CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); GOTO(out_set, rc = -EIO); } @@ -618,7 +618,7 @@ int lov_update_create_set(struct lov_request_set *set, req->rq_stripe = set->set_success; loi = &lsm->lsm_oinfo[req->rq_stripe]; - if (rc && lov->tgts[req->rq_idx].active) { + if (rc && lov->tgts[req->rq_idx].ltd_active) { CERROR("error creating fid "LPX64" sub-object" " on OST idx %d/%d: rc = %d\n", set->set_oa->o_id, req->rq_idx, @@ -805,7 +805,7 @@ int lov_prep_brw_set(struct obd_export *exp, struct obdo *src_oa, if (info[i].count == 0) continue; - if (lov->tgts[loi->loi_ost_idx].active == 0) { + if (lov->tgts[loi->loi_ost_idx].ltd_active == 0) { CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); GOTO(out, rc = -EIO); } @@ -914,7 +914,7 @@ int lov_prep_getattr_set(struct obd_export *exp, struct obdo *src_oa, for (i = 0; i < lsm->lsm_stripe_count; i++, loi++) { struct lov_request *req; - if (lov->tgts[loi->loi_ost_idx].active == 0) { + if (lov->tgts[loi->loi_ost_idx].ltd_active == 0) { CDEBUG(D_HA, 
"lov idx %d inactive\n", loi->loi_ost_idx); continue; } @@ -989,7 +989,7 @@ int lov_prep_destroy_set(struct obd_export *exp, struct obdo *src_oa, for (i = 0; i < lsm->lsm_stripe_count; i++, loi++) { struct lov_request *req; - if (lov->tgts[loi->loi_ost_idx].active == 0) { + if (lov->tgts[loi->loi_ost_idx].ltd_active == 0) { CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); continue; } @@ -1066,7 +1066,7 @@ int lov_prep_setattr_set(struct obd_export *exp, struct obdo *src_oa, for (i = 0; i < lsm->lsm_stripe_count; i++, loi++) { struct lov_request *req; - if (lov->tgts[loi->loi_ost_idx].active == 0) { + if (lov->tgts[loi->loi_ost_idx].ltd_active == 0) { CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); continue; } @@ -1115,7 +1115,7 @@ int lov_update_setattr_set(struct lov_request_set *set, lov_update_set(set, req, rc); /* grace error on inactive ost */ - if (rc && !lov->tgts[req->rq_idx].active) + if (rc && !lov->tgts[req->rq_idx].ltd_active) rc = 0; /* FIXME: LOV STACKING update loi data should be done by OSC * @@ -1142,7 +1142,7 @@ int lov_update_punch_set(struct lov_request_set *set, struct lov_request *req, ENTRY; lov_update_set(set, req, rc); - if (rc && !lov->tgts[req->rq_idx].active) + if (rc && !lov->tgts[req->rq_idx].ltd_active) rc = 0; /* FIXME in raid1 regime, should return 0 */ RETURN(rc); @@ -1193,7 +1193,7 @@ int lov_prep_punch_set(struct obd_export *exp, struct obdo *src_oa, struct lov_request *req; obd_off rs, re; - if (lov->tgts[loi->loi_ost_idx].active == 0) { + if (lov->tgts[loi->loi_ost_idx].ltd_active == 0) { CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); continue; } @@ -1275,7 +1275,7 @@ int lov_prep_sync_set(struct obd_export *exp, struct obdo *src_oa, struct lov_request *req; obd_off rs, re; - if (lov->tgts[loi->loi_ost_idx].active == 0) { + if (lov->tgts[loi->loi_ost_idx].ltd_active == 0) { CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); continue; } diff --git a/lustre/lov/lproc_lov.c b/lustre/lov/lproc_lov.c 
index 5ae9f62..06da469 100644 --- a/lustre/lov/lproc_lov.c +++ b/lustre/lov/lproc_lov.c @@ -118,8 +118,9 @@ static int lov_rd_desc_uuid(char *page, char **start, off_t off, int count, return snprintf(page, count, "%s\n", lov->desc.ld_uuid.uuid); } -static int lov_rd_qos_threshold(char *page, char **start, off_t off, int count, - int *eof, void *data) +/* free priority (0-255): how badly user wants to choose empty osts */ +static int lov_rd_qos_priofree(char *page, char **start, off_t off, int count, + int *eof, void *data) { struct obd_device *dev = (struct obd_device*) data; struct lov_obd *lov; @@ -127,11 +128,12 @@ static int lov_rd_qos_threshold(char *page, char **start, off_t off, int count, LASSERT(dev != NULL); lov = &dev->u.lov; *eof = 1; - return snprintf(page, count, "%u MB\n", lov->desc.ld_qos_threshold); + return snprintf(page, count, "%d%%\n", + (lov->lov_qos.lq_prio_free * 100) >> 8); } -static int lov_wr_qos_threshold(struct file *file, const char *buffer, - unsigned long count, void *data) +static int lov_wr_qos_priofree(struct file *file, const char *buffer, + unsigned long count, void *data) { struct obd_device *dev = (struct obd_device *)data; struct lov_obd *lov; @@ -143,9 +145,11 @@ static int lov_wr_qos_threshold(struct file *file, const char *buffer, if (rc) return rc; - if (val <= 0) + if (val > 100) return -EINVAL; - lov->desc.ld_qos_threshold = val; + lov->lov_qos.lq_prio_free = (val << 8) / 100; + lov->lov_qos.lq_dirty = 1; + lov->lov_qos.lq_reset = 1; return count; } @@ -208,8 +212,9 @@ static int lov_tgt_seq_show(struct seq_file *p, void *v) struct obd_device *dev = p->private; struct lov_obd *lov = &dev->u.lov; int idx = tgt - &(lov->tgts[0]); - return seq_printf(p, "%d: %s %sACTIVE\n", idx, tgt->uuid.uuid, - tgt->active ? "" : "IN"); + return seq_printf(p, "%d: %s %sACTIVE\n", idx, + obd_uuid2str(&tgt->ltd_uuid), + tgt->ltd_active ? 
"" : "IN"); } struct seq_operations lov_tgt_sops = { @@ -250,7 +255,7 @@ struct lprocfs_vars lprocfs_obd_vars[] = { { "kbytesfree", lprocfs_rd_kbytesfree, 0, 0 }, { "kbytesavail", lprocfs_rd_kbytesavail, 0, 0 }, { "desc_uuid", lov_rd_desc_uuid, 0, 0 }, - { "qos_threshold",lov_rd_qos_threshold, lov_wr_qos_threshold, 0 }, + { "qos_prio_free", lov_rd_qos_priofree, lov_wr_qos_priofree, 0 }, { "qos_maxage", lov_rd_qos_maxage, lov_wr_qos_maxage, 0 }, { 0 } }; diff --git a/lustre/ptlrpc/pack_generic.c b/lustre/ptlrpc/pack_generic.c index c84159d..52ef4dc 100644 --- a/lustre/ptlrpc/pack_generic.c +++ b/lustre/ptlrpc/pack_generic.c @@ -783,7 +783,6 @@ void lustre_swab_lov_desc (struct lov_desc *ld) __swab64s (&ld->ld_default_stripe_size); __swab64s (&ld->ld_default_stripe_offset); __swab32s (&ld->ld_pattern); - __swab32s (&ld->ld_qos_threshold); __swab32s (&ld->ld_qos_maxage); /* uuid endian insensitive */ } @@ -2101,10 +2100,6 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct lov_desc, ld_default_stripe_offset)); LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset) == 8, " found %lld\n", (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset)); - LASSERTF((int)offsetof(struct lov_desc, ld_qos_threshold) == 32, " found %lld\n", - (long long)(int)offsetof(struct lov_desc, ld_qos_threshold)); - LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_qos_threshold) == 4, " found %lld\n", - (long long)(int)sizeof(((struct lov_desc *)0)->ld_qos_threshold)); LASSERTF((int)offsetof(struct lov_desc, ld_qos_maxage) == 36, " found %lld\n", (long long)(int)offsetof(struct lov_desc, ld_qos_maxage)); LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_qos_maxage) == 4, " found %lld\n", diff --git a/lustre/quota/quota_check.c b/lustre/quota/quota_check.c index 6086088..d733ede 100644 --- a/lustre/quota/quota_check.c +++ b/lustre/quota/quota_check.c @@ -224,13 +224,13 @@ int lov_quota_check(struct obd_export *exp, struct obd_quotactl *oqctl) 
for (i = 0; i < lov->desc.ld_tgt_count; i++) { int err; - if (!lov->tgts[i].active) { + if (!lov->tgts[i].ltd_active) { CERROR("lov idx %d inactive\n", i); RETURN(-EIO); } err = obd_quotacheck(lov->tgts[i].ltd_exp, oqctl); - if (err && lov->tgts[i].active && !rc) + if (err && lov->tgts[i].ltd_active && !rc) rc = err; } diff --git a/lustre/quota/quota_ctl.c b/lustre/quota/quota_ctl.c index a8c4317..d3354b4 100644 --- a/lustre/quota/quota_ctl.c +++ b/lustre/quota/quota_ctl.c @@ -239,7 +239,7 @@ int lov_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl) for (i = 0; i < lov->desc.ld_tgt_count; i++) { int err; - if (!lov->tgts[i].active) { + if (!lov->tgts[i].ltd_active) { if (oqctl->qc_cmd == Q_GETOQUOTA) { CERROR("ost %d is inactive\n", i); rc = -EIO; @@ -252,7 +252,7 @@ int lov_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl) err = obd_quotactl(lov->tgts[i].ltd_exp, oqctl); if (err) { - if (lov->tgts[i].active && !rc) + if (lov->tgts[i].ltd_active && !rc) rc = err; continue; } diff --git a/lustre/utils/lustre_cfg.c b/lustre/utils/lustre_cfg.c index 342a4da..2f972d3 100644 --- a/lustre/utils/lustre_cfg.c +++ b/lustre/utils/lustre_cfg.c @@ -389,7 +389,6 @@ int jt_lcfg_lov_setup(int argc, char **argv) jt_cmdname(argv[0]), argv[5]); return CMD_HELP; } - desc.ld_qos_threshold = QOS_DEFAULT_THRESHOLD; desc.ld_qos_maxage = QOS_DEFAULT_MAXAGE; if (argc == 7) { diff --git a/lustre/utils/wirecheck.c b/lustre/utils/wirecheck.c index fbaa08a..d8ea96f 100644 --- a/lustre/utils/wirecheck.c +++ b/lustre/utils/wirecheck.c @@ -494,7 +494,6 @@ check_lov_desc(void) CHECK_MEMBER(lov_desc, ld_pattern); CHECK_MEMBER(lov_desc, ld_default_stripe_size); CHECK_MEMBER(lov_desc, ld_default_stripe_offset); - CHECK_MEMBER(lov_desc, ld_qos_threshold); CHECK_MEMBER(lov_desc, ld_qos_maxage); CHECK_MEMBER(lov_desc, ld_padding_1); CHECK_MEMBER(lov_desc, ld_padding_2); diff --git a/lustre/utils/wiretest.c b/lustre/utils/wiretest.c index f2b33f8..4867401 100644 --- 
a/lustre/utils/wiretest.c +++ b/lustre/utils/wiretest.c @@ -1170,10 +1170,6 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct lov_desc, ld_default_stripe_offset)); LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset) == 8, " found %lld\n", (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset)); - LASSERTF((int)offsetof(struct lov_desc, ld_qos_threshold) == 32, " found %lld\n", - (long long)(int)offsetof(struct lov_desc, ld_qos_threshold)); - LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_qos_threshold) == 4, " found %lld\n", - (long long)(int)sizeof(((struct lov_desc *)0)->ld_qos_threshold)); LASSERTF((int)offsetof(struct lov_desc, ld_qos_maxage) == 36, " found %lld\n", (long long)(int)offsetof(struct lov_desc, ld_qos_maxage)); LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_qos_maxage) == 4, " found %lld\n", -- 1.8.3.1