From d9b4bc5476c779aaaee6797e5e148b5e0b771980 Mon Sep 17 00:00:00 2001 From: Rahul Deshmukh Date: Mon, 18 May 2015 10:51:53 +0530 Subject: [PATCH 1/1] LU-977 lod: Patch to protect lqr_start_idx Protect lqr_start_idx to avoid the imbalance in allocating objects on OSTs with round-robin algorithm Signed-off-by: Rahul Deshmukh Change-Id: I689ba1c4e8c9224cc67badba7fd6cf45e64dd7b6 Seagate-bug-id: MRP-2471 Reviewed-on: http://review.whamcloud.com/14636 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Alex Zhuravlev Reviewed-by: Oleg Drokin --- lustre/lod/lod_internal.h | 6 +- lustre/lod/lod_lov.c | 1 + lustre/lod/lod_pool.c | 2 +- lustre/lod/lod_qos.c | 148 +++++++++++++++++++++++++++------------------- 4 files changed, 93 insertions(+), 64 deletions(-) diff --git a/lustre/lod/lod_internal.h b/lustre/lod/lod_internal.h index 6f1bff1..cd9f3a1 100644 --- a/lustre/lod/lod_internal.h +++ b/lustre/lod/lod_internal.h @@ -56,9 +56,10 @@ #define LOV_OFFSET_DEFAULT ((__u16)-1) struct lod_qos_rr { + spinlock_t lqr_alloc; /* protect allocation index */ __u32 lqr_start_idx; /* start index of new inode */ - __u32 lqr_offset_idx; /* aliasing for start_idx */ - int lqr_start_count; /* reseed counter */ + __u32 lqr_offset_idx;/* aliasing for start_idx */ + int lqr_start_count;/* reseed counter */ struct ost_pool lqr_pool; /* round-robin optimized list */ unsigned long lqr_dirty:1; /* recalc round-robin list */ }; @@ -474,6 +475,7 @@ int lod_qos_prep_create(const struct lu_env *env, struct lod_object *lo, struct thandle *th); int qos_add_tgt(struct lod_device*, struct lod_tgt_desc *); int qos_del_tgt(struct lod_device *, struct lod_tgt_desc *); +void lod_qos_rr_init(struct lod_qos_rr *lqr); /* lproc_lod.c */ int lod_procfs_init(struct lod_device *lod); diff --git a/lustre/lod/lod_lov.c b/lustre/lod/lod_lov.c index 63b5018..0ae2f5d 100644 --- a/lustre/lod/lod_lov.c +++ b/lustre/lod/lod_lov.c @@ -1314,6 +1314,7 @@ int lod_pools_init(struct lod_device *lod, struct 
lustre_cfg *lcfg) rc = lod_ost_pool_init(&lod->lod_pool_info, 0); if (rc) GOTO(out_hash, rc); + lod_qos_rr_init(&lod->lod_qos.lq_rr); rc = lod_ost_pool_init(&lod->lod_qos.lq_rr.lqr_pool, 0); if (rc) GOTO(out_pool_info, rc); diff --git a/lustre/lod/lod_pool.c b/lustre/lod/lod_pool.c index 5caef4c..da5389c 100644 --- a/lustre/lod/lod_pool.c +++ b/lustre/lod/lod_pool.c @@ -661,7 +661,7 @@ int lod_pool_new(struct obd_device *obd, char *poolname) if (rc) GOTO(out_err, rc); - memset(&new_pool->pool_rr, 0, sizeof(new_pool->pool_rr)); + lod_qos_rr_init(&new_pool->pool_rr); rc = lod_ost_pool_init(&new_pool->pool_rr.lqr_pool, 0); if (rc) GOTO(out_free_pool_obds, rc); diff --git a/lustre/lod/lod_qos.c b/lustre/lod/lod_qos.c index d447d31..1cd345206 100644 --- a/lustre/lod/lod_qos.c +++ b/lustre/lod/lod_qos.c @@ -525,6 +525,13 @@ static int lod_qos_used(struct lod_device *lod, struct ost_pool *osts, RETURN(0); } +void lod_qos_rr_init(struct lod_qos_rr *lqr) +{ + spin_lock_init(&lqr->lqr_alloc); + lqr->lqr_dirty = 1; +} + + #define LOV_QOS_EMPTY ((__u32)-1) /** @@ -825,6 +832,74 @@ static int lod_qos_is_ost_used(const struct lu_env *env, int ost, __u32 stripes) return 0; } +static int lod_check_and_reserve_ost(const struct lu_env *env, + struct lod_device *m, + struct obd_statfs *sfs, __u32 ost_idx, + __u32 speed, __u32 *s_idx, + struct dt_object **stripe, + struct thandle *th) +{ + struct dt_object *o; + __u32 stripe_idx = *s_idx; + int rc; + + rc = lod_statfs_and_check(env, m, ost_idx, sfs); + if (rc) { + /* this OSP doesn't feel well */ + goto out_return; + } + + /* + * skip full devices + */ + if (lod_qos_dev_is_full(sfs)) { + QOS_DEBUG("#%d is full\n", ost_idx); + goto out_return; + } + + /* + * We expect number of precreated objects in f_ffree at + * the first iteration, skip OSPs with no objects ready + */ + if (sfs->os_fprecreated == 0 && speed == 0) { + QOS_DEBUG("#%d: precreation is empty\n", ost_idx); + goto out_return; + } + + /* + * try to use another OSP if this 
one is degraded + */ + if (sfs->os_state & OS_STATE_DEGRADED && speed < 2) { + QOS_DEBUG("#%d: degraded\n", ost_idx); + goto out_return; + } + + /* + * do not put >1 objects on a single OST + */ + if (speed && lod_qos_is_ost_used(env, ost_idx, stripe_idx)) + goto out_return; + + o = lod_qos_declare_object_on(env, m, ost_idx, th); + if (IS_ERR(o)) { + CDEBUG(D_OTHER, "can't declare new object on #%u: %d\n", + ost_idx, (int) PTR_ERR(o)); + rc = PTR_ERR(o); + goto out_return; + } + + /* + * We've successfully declared (reserved) an object + */ + lod_qos_ost_in_use(env, stripe_idx, ost_idx); + stripe[stripe_idx] = o; + stripe_idx++; + *s_idx = stripe_idx; + +out_return: + return rc; +} + /** * Allocate a striping using round-robin algorithm. * @@ -858,7 +933,6 @@ static int lod_alloc_rr(const struct lu_env *env, struct lod_object *lo, struct pool_desc *pool = NULL; struct ost_pool *osts; struct lod_qos_rr *lqr; - struct dt_object *o; unsigned int i, array_idx; int rc; __u32 ost_start_idx_temp; @@ -889,6 +963,8 @@ static int lod_alloc_rr(const struct lu_env *env, struct lod_object *lo, if (rc) GOTO(out, rc); + down_read(&m->lod_qos.lq_rw_sem); + spin_lock(&lqr->lqr_alloc); if (--lqr->lqr_start_count <= 0) { lqr->lqr_start_idx = cfs_rand() % osts->op_count; lqr->lqr_start_count = @@ -903,22 +979,19 @@ static int lod_alloc_rr(const struct lu_env *env, struct lod_object *lo, if (stripe_cnt > 1 && (osts->op_count % stripe_cnt) != 1) ++lqr->lqr_offset_idx; } - down_read(&m->lod_qos.lq_rw_sem); ost_start_idx_temp = lqr->lqr_start_idx; repeat_find: - array_idx = (lqr->lqr_start_idx + lqr->lqr_offset_idx) % - osts->op_count; QOS_DEBUG("pool '%s' want %d startidx %d startcnt %d offset %d " - "active %d count %d arrayidx %d\n", + "active %d count %d\n", lo->ldo_pool ? 
lo->ldo_pool : "", stripe_cnt, lqr->lqr_start_idx, lqr->lqr_start_count, - lqr->lqr_offset_idx, osts->op_count, osts->op_count, - array_idx); + lqr->lqr_offset_idx, osts->op_count, osts->op_count); - for (i = 0; i < osts->op_count && stripe_idx < lo->ldo_stripenr; - i++, array_idx = (array_idx + 1) % osts->op_count) { + for (i = 0; i < osts->op_count && stripe_idx < lo->ldo_stripenr; i++) { + array_idx = (lqr->lqr_start_idx + lqr->lqr_offset_idx) % + osts->op_count; ++lqr->lqr_start_idx; ost_idx = lqr->lqr_pool.op_array[array_idx]; @@ -935,58 +1008,10 @@ repeat_find: if (OBD_FAIL_CHECK(OBD_FAIL_MDS_OSC_PRECREATE) && ost_idx == 0) continue; - rc = lod_statfs_and_check(env, m, ost_idx, sfs); - if (rc) { - /* this OSP doesn't feel well */ - continue; - } - - /* - * skip full devices - */ - if (lod_qos_dev_is_full(sfs)) { - QOS_DEBUG("#%d is full\n", ost_idx); - continue; - } - - /* - * We expect number of precreated objects in f_ffree at - * the first iteration, skip OSPs with no objects ready - */ - if (sfs->os_fprecreated == 0 && speed == 0) { - QOS_DEBUG("#%d: precreation is empty\n", ost_idx); - continue; - } - - /* - * try to use another OSP if this one is degraded - */ - if (sfs->os_state & OS_STATE_DEGRADED && speed < 2) { - QOS_DEBUG("#%d: degraded\n", ost_idx); - continue; - } - - /* - * do not put >1 objects on a single OST - */ - if (speed && lod_qos_is_ost_used(env, ost_idx, stripe_idx)) - continue; - - o = lod_qos_declare_object_on(env, m, ost_idx, th); - if (IS_ERR(o)) { - CDEBUG(D_OTHER, "can't declare new object on #%u: %d\n", - ost_idx, (int) PTR_ERR(o)); - rc = PTR_ERR(o); - continue; - } - - /* - * We've successfully declared (reserved) an object - */ - lod_qos_ost_in_use(env, stripe_idx, ost_idx); - stripe[stripe_idx] = o; - stripe_idx++; - + spin_unlock(&lqr->lqr_alloc); + rc = lod_check_and_reserve_ost(env, m, sfs, ost_idx, speed, + &stripe_idx, stripe, th); + spin_lock(&lqr->lqr_alloc); } if ((speed < 2) && (stripe_idx < stripe_cnt_min)) { /* 
Try again, allowing slower OSCs */ @@ -995,6 +1020,7 @@ repeat_find: goto repeat_find; } + spin_unlock(&lqr->lqr_alloc); up_read(&m->lod_qos.lq_rw_sem); if (stripe_idx) { -- 1.8.3.1