LU-11213 lmv: reuse object alloc QoS code from LOD 57/34657/15
author Lai Siyao <lai.siyao@whamcloud.com>
Fri, 22 Mar 2019 00:22:37 +0000 (08:22 +0800)
committer Oleg Drokin <green@whamcloud.com>
Thu, 13 Jun 2019 04:16:54 +0000 (04:16 +0000)
Reuse the same object allocation QoS code as LOD. The QoS code is not
moved to a lower-layer module but copied into LMV for now, because
moving it would involve almost all of the LMV code, which is too big a
change and should be done separately in the future.

For LMV round-robin object allocation, since only one object needs to
be allocated at a time, use the saved MDT index and then advance it to
the next MDT.
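
As a rough userspace model of this saved-index round robin (not the
patch code; pick_mdt_rr(), mdt_active[] and MDT_COUNT are made-up
names, and rr_index stands in for lmv_qos_rr_index, while the index
update mirrors lmv_locate_tgt_rr() below):

#include <stdio.h>
#include <stdbool.h>

#define MDT_COUNT 4

static unsigned int rr_index;		/* plays the role of lmv_qos_rr_index */
static bool mdt_active[MDT_COUNT] = { true, true, false, true };

/* Use the remembered index, skip inactive MDTs, then advance the index. */
static int pick_mdt_rr(void)
{
	unsigned int i;

	for (i = 0; i < MDT_COUNT; i++) {
		unsigned int idx = (rr_index + i) % MDT_COUNT;

		if (!mdt_active[idx])
			continue;
		rr_index = (idx + 1) % MDT_COUNT;
		return idx;
	}
	return -1;	/* no active MDT; the patch returns -ENODEV */
}

int main(void)
{
	int n;

	for (n = 0; n < 6; n++)
		printf("mkdir #%d -> MDT%04x\n", n, pick_mdt_rr());
	return 0;	/* MDT0000, MDT0001, MDT0003, MDT0000, ... */
}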

Add sanity test 413b.

Signed-off-by: Lai Siyao <lai.siyao@whamcloud.com>
Change-Id: I53c3d863dafda534eebb6b95da205b395071cd25
Reviewed-on: https://review.whamcloud.com/34657
Tested-by: Jenkins
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Hongchao Zhang <hongchao@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
15 files changed:
lustre/include/lu_object.h
lustre/include/obd.h
lustre/include/uapi/linux/lustre/lustre_idl.h
lustre/lmv/Makefile.in
lustre/lmv/lmv_intent.c
lustre/lmv/lmv_internal.h
lustre/lmv/lmv_obd.c
lustre/lmv/lmv_qos.c [new file with mode: 0644]
lustre/lmv/lproc_lmv.c
lustre/lod/lod_internal.h
lustre/lod/lod_lov.c
lustre/lod/lod_qos.c
lustre/obdclass/Makefile.in
lustre/obdclass/lu_qos.c [new file with mode: 0644]
lustre/tests/sanity.sh [changed mode: 0755->0644]

index 3c04907..86ed326 100644 (file)
@@ -1387,5 +1387,89 @@ static inline bool lu_object_is_cl(const struct lu_object *o)
        return lu_device_is_cl(o->lo_dev);
 }
 
+/* Generic subset of OSTs */
+struct ost_pool {
+       __u32              *op_array;   /* array of index of
+                                        * lov_obd->lov_tgts */
+       unsigned int        op_count;   /* number of OSTs in the array */
+       unsigned int        op_size;    /* allocated size of lp_array */
+       struct rw_semaphore op_rw_sem;  /* to protect ost_pool use */
+};
+
+/* round-robin QoS data for LOD/LMV */
+struct lu_qos_rr {
+       spinlock_t               lqr_alloc;     /* protect allocation index */
+       __u32                    lqr_start_idx; /* start index of new inode */
+       __u32                    lqr_offset_idx;/* aliasing for start_idx */
+       int                      lqr_start_count;/* reseed counter */
+       struct ost_pool          lqr_pool;      /* round-robin optimized list */
+       unsigned long            lqr_dirty:1;   /* recalc round-robin list */
+};
+
+/* QoS data per MDS/OSS */
+struct lu_svr_qos {
+       struct obd_uuid          lsq_uuid;      /* ptlrpc's c_remote_uuid */
+       struct list_head         lsq_svr_list;  /* link to lq_svr_list */
+       __u64                    lsq_bavail;    /* total bytes avail on svr */
+       __u64                    lsq_iavail;    /* total inode avail on svr */
+       __u64                    lsq_penalty;   /* current penalty */
+       __u64                    lsq_penalty_per_obj; /* penalty decrease
+                                                      * every obj*/
+       time64_t                 lsq_used;      /* last used time, seconds */
+       __u32                    lsq_tgt_count; /* number of tgts on this svr */
+       __u32                    lsq_id;        /* unique svr id */
+};
+
+/* QoS data per MDT/OST */
+struct lu_tgt_qos {
+       struct lu_svr_qos       *ltq_svr;       /* svr info */
+       __u64                    ltq_penalty;   /* current penalty */
+       __u64                    ltq_penalty_per_obj; /* penalty decrease
+                                                      * every obj*/
+       __u64                    ltq_weight;    /* net weighting */
+       time64_t                 ltq_used;      /* last used time, seconds */
+       bool                     ltq_usable:1;  /* usable for striping */
+};
+
+/* target descriptor */
+struct lu_tgt_desc {
+       union {
+               struct dt_device        *ltd_tgt;
+               struct obd_device       *ltd_obd;
+       };
+       struct obd_export *ltd_exp;
+       struct obd_uuid    ltd_uuid;
+       __u32              ltd_index;
+       __u32              ltd_gen;
+       struct list_head   ltd_kill;
+       struct ptlrpc_thread    *ltd_recovery_thread;
+       struct mutex       ltd_fid_mutex;
+       struct lu_tgt_qos  ltd_qos; /* qos info per target */
+       struct obd_statfs  ltd_statfs;
+       time64_t           ltd_statfs_age;
+       unsigned long      ltd_active:1,/* is this target up for requests */
+                          ltd_activate:1,/* should target be activated */
+                          ltd_reap:1,  /* should this target be deleted */
+                          ltd_got_update_log:1, /* Already got update log */
+                          ltd_connecting:1; /* target is connecting */
+};
+
+/* QoS data for LOD/LMV */
+struct lu_qos {
+       struct list_head         lq_svr_list;   /* lu_svr_qos list */
+       struct rw_semaphore      lq_rw_sem;
+       __u32                    lq_active_svr_count;
+       unsigned int             lq_prio_free;   /* priority for free space */
+       unsigned int             lq_threshold_rr;/* priority for rr */
+       struct lu_qos_rr         lq_rr;          /* round robin qos data */
+       unsigned long            lq_dirty:1,     /* recalc qos data */
+                                lq_same_space:1,/* the servers all have approx.
+                                                 * the same space avail */
+                                lq_reset:1;     /* zero current penalties */
+};
+
+int lqos_add_tgt(struct lu_qos *qos, struct lu_tgt_desc *ltd);
+int lqos_del_tgt(struct lu_qos *qos, struct lu_tgt_desc *ltd);
+
 /** @} lu */
 #endif /* __LUSTRE_LU_OBJECT_H */
index b566809..cc15c9f 100644 (file)
@@ -88,7 +88,7 @@ struct obd_info {
        /* OBD_STATFS_* flags */
        __u64                   oi_flags;
        struct obd_device      *oi_obd;
-       struct lmv_tgt_desc    *oi_tgt;
+       struct lu_tgt_desc     *oi_tgt;
         /* statfs data specific for every OSC, if needed at all. */
         struct obd_statfs      *oi_osfs;
         /* An update callback which is called to update some data on upper
@@ -379,29 +379,10 @@ struct echo_client_obd {
        __u64                   ec_unique;
 };
 
-/* Generic subset of OSTs */
-struct ost_pool {
-        __u32              *op_array;      /* array of index of
-                                                   lov_obd->lov_tgts */
-        unsigned int        op_count;      /* number of OSTs in the array */
-        unsigned int        op_size;       /* allocated size of lp_array */
-       struct rw_semaphore op_rw_sem;     /* to protect ost_pool use */
-};
-
 /* allow statfs data caching for 1 second */
 #define OBD_STATFS_CACHE_SECONDS 1
 
-struct lov_tgt_desc {
-       struct list_head    ltd_kill;
-        struct obd_uuid     ltd_uuid;
-        struct obd_device  *ltd_obd;
-        struct obd_export  *ltd_exp;
-        __u32               ltd_gen;
-        __u32               ltd_index;   /* index in lov_obd->tgts */
-        unsigned long       ltd_active:1,/* is this target up for requests */
-                            ltd_activate:1,/* should  target be activated */
-                            ltd_reap:1;  /* should this target be deleted */
-};
+#define lov_tgt_desc lu_tgt_desc
 
 struct lov_md_tgt_desc {
        struct obd_device *lmtd_mdc;
@@ -436,16 +417,7 @@ struct lov_obd {
        struct kobject          *lov_tgts_kobj;
 };
 
-struct lmv_tgt_desc {
-       struct obd_uuid         ltd_uuid;
-       struct obd_device       *ltd_obd;
-       struct obd_export       *ltd_exp;
-       __u32                   ltd_idx;
-       struct mutex            ltd_fid_mutex;
-       struct obd_statfs       ltd_statfs;
-       time64_t                ltd_statfs_age;
-       unsigned long           ltd_active:1; /* target up for requests */
-};
+#define lmv_tgt_desc lu_tgt_desc
 
 struct lmv_obd {
        struct lu_client_fld    lmv_fld;
@@ -464,6 +436,9 @@ struct lmv_obd {
        struct obd_connect_data conn_data;
        struct kobject          *lmv_tgts_kobj;
        void                    *lmv_cache;
+
+       struct lu_qos           lmv_qos;
+       __u32                   lmv_qos_rr_index;
 };
 
 /* Minimum sector size is 512 */
index 9557039..e628f43 100644 (file)
@@ -2162,6 +2162,8 @@ struct mdt_rec_reint {
        __u16           rr_padding_4; /* also fix lustre_swab_mdt_rec_reint */
 };
 
+#define LMV_DESC_QOS_MAXAGE_DEFAULT 60  /* Seconds */
+
 /* lmv structures */
 struct lmv_desc {
        __u32 ld_tgt_count;             /* how many MDS's */
index f03d419..4fca0f6 100644 (file)
@@ -1,4 +1,4 @@
 MODULES := lmv
-lmv-objs := lmv_obd.o lmv_intent.o lmv_fld.o lproc_lmv.o
+lmv-objs := lmv_obd.o lmv_intent.o lmv_fld.o lproc_lmv.o lmv_qos.o
 
 @INCLUDE_RULES@
index 2ee7aa9..a7a1903 100644 (file)
@@ -106,7 +106,7 @@ static int lmv_intent_remote(struct obd_export *exp, struct lookup_intent *it,
 
        op_data->op_bias = MDS_CROSS_REF;
        CDEBUG(D_INODE, "REMOTE_INTENT with fid="DFID" -> mds #%u\n",
-              PFID(&body->mbo_fid1), tgt->ltd_idx);
+              PFID(&body->mbo_fid1), tgt->ltd_index);
 
        /* ask for security context upon intent */
        if (it->it_op & (IT_LOOKUP | IT_GETATTR | IT_OPEN) &&
@@ -206,7 +206,7 @@ int lmv_revalidate_slaves(struct obd_export *exp,
                        GOTO(cleanup, rc = PTR_ERR(tgt));
 
                CDEBUG(D_INODE, "Revalidate slave "DFID" -> mds #%u\n",
-                      PFID(&fid), tgt->ltd_idx);
+                      PFID(&fid), tgt->ltd_index);
 
                if (req != NULL) {
                        ptlrpc_req_finished(req);
@@ -350,7 +350,7 @@ retry:
                if (IS_ERR(tgt))
                        RETURN(PTR_ERR(tgt));
 
-               op_data->op_mds = tgt->ltd_idx;
+               op_data->op_mds = tgt->ltd_index;
        } else {
                LASSERT(fid_is_sane(&op_data->op_fid1));
                LASSERT(fid_is_zero(&op_data->op_fid2));
@@ -375,7 +375,7 @@ retry:
 
        CDEBUG(D_INODE, "OPEN_INTENT with fid1="DFID", fid2="DFID","
               " name='%s' -> mds #%u\n", PFID(&op_data->op_fid1),
-              PFID(&op_data->op_fid2), op_data->op_name, tgt->ltd_idx);
+              PFID(&op_data->op_fid2), op_data->op_name, tgt->ltd_index);
 
        rc = md_intent_lock(tgt->ltd_exp, op_data, it, reqp, cb_blocking,
                            extra_lock_flags);
@@ -460,7 +460,7 @@ retry:
               ", name='%s' -> mds #%u\n",
               PFID(&op_data->op_fid1), PFID(&op_data->op_fid2),
               op_data->op_name ? op_data->op_name : "<NULL>",
-              tgt->ltd_idx);
+              tgt->ltd_index);
 
        op_data->op_bias &= ~MDS_CROSS_REF;
 
index 7017e94..2589f9f 100644 (file)
@@ -60,6 +60,8 @@ int lmv_revalidate_slaves(struct obd_export *exp,
 
 int lmv_getattr_name(struct obd_export *exp, struct md_op_data *op_data,
                     struct ptlrpc_request **preq);
+void lmv_activate_target(struct lmv_obd *lmv, struct lmv_tgt_desc *tgt,
+                        int activate);
 
 int lmv_statfs_check_update(struct obd_device *obd, struct lmv_tgt_desc *tgt);
 
@@ -77,7 +79,7 @@ lmv_get_target(struct lmv_obd *lmv, u32 mdt_idx, int *index)
                if (lmv->tgts[i] == NULL)
                        continue;
 
-               if (lmv->tgts[i]->ltd_idx == mdt_idx) {
+               if (lmv->tgts[i]->ltd_index == mdt_idx) {
                        if (index != NULL)
                                *index = i;
                        return lmv->tgts[i];
@@ -192,6 +194,10 @@ static inline bool lmv_dir_retry_check_update(struct md_op_data *op_data)
 struct lmv_tgt_desc *lmv_locate_tgt(struct lmv_obd *lmv,
                                    struct md_op_data *op_data);
 
+/* lmv_qos.c */
+struct lu_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv, __u32 *mdt);
+struct lu_tgt_desc *lmv_locate_tgt_rr(struct lmv_obd *lmv, __u32 *mdt);
+
 /* lproc_lmv.c */
 int lmv_tunables_init(struct obd_device *obd);
 
index 440bceb..3d73e22 100644 (file)
@@ -60,9 +60,8 @@
 
 static int lmv_check_connect(struct obd_device *obd);
 
-static void lmv_activate_target(struct lmv_obd *lmv,
-                                struct lmv_tgt_desc *tgt,
-                                int activate)
+void lmv_activate_target(struct lmv_obd *lmv, struct lmv_tgt_desc *tgt,
+                        int activate)
 {
         if (tgt->ltd_active == activate)
                 return;
@@ -294,21 +293,21 @@ int lmv_connect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt)
         int                      rc;
         ENTRY;
 
-        mdc_obd = class_find_client_obd(&tgt->ltd_uuid, LUSTRE_MDC_NAME,
-                                        &obd->obd_uuid);
-        if (!mdc_obd) {
-                CERROR("target %s not attached\n", tgt->ltd_uuid.uuid);
-                RETURN(-EINVAL);
-        }
+       mdc_obd = class_find_client_obd(&tgt->ltd_uuid, LUSTRE_MDC_NAME,
+                                       &obd->obd_uuid);
+       if (!mdc_obd) {
+               CERROR("target %s not attached\n", tgt->ltd_uuid.uuid);
+               RETURN(-EINVAL);
+       }
 
        CDEBUG(D_CONFIG, "connect to %s(%s) - %s, %s\n",
               mdc_obd->obd_name, mdc_obd->obd_uuid.uuid,
               tgt->ltd_uuid.uuid, obd->obd_uuid.uuid);
 
-        if (!mdc_obd->obd_set_up) {
-                CERROR("target %s is not set up\n", tgt->ltd_uuid.uuid);
-                RETURN(-EINVAL);
-        }
+       if (!mdc_obd->obd_set_up) {
+               CERROR("target %s is not set up\n", tgt->ltd_uuid.uuid);
+               RETURN(-EINVAL);
+       }
 
        rc = obd_connect(NULL, &mdc_exp, mdc_obd, &obd->obd_uuid,
                         &lmv->conn_data, lmv->lmv_cache);
@@ -324,19 +323,19 @@ int lmv_connect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt)
        if (rc)
                RETURN(rc);
 
-        target.ft_srv = NULL;
-        target.ft_exp = mdc_exp;
-        target.ft_idx = tgt->ltd_idx;
+       target.ft_srv = NULL;
+       target.ft_exp = mdc_exp;
+       target.ft_idx = tgt->ltd_index;
 
-        fld_client_add_target(&lmv->lmv_fld, &target);
+       fld_client_add_target(&lmv->lmv_fld, &target);
 
-        rc = obd_register_observer(mdc_obd, obd);
-        if (rc) {
-                obd_disconnect(mdc_exp);
-                CERROR("target %s register_observer error %d\n",
-                       tgt->ltd_uuid.uuid, rc);
-                RETURN(rc);
-        }
+       rc = obd_register_observer(mdc_obd, obd);
+       if (rc) {
+               obd_disconnect(mdc_exp);
+               CERROR("target %s register_observer error %d\n",
+                      tgt->ltd_uuid.uuid, rc);
+               RETURN(rc);
+       }
 
        if (obd->obd_observer) {
                /*
@@ -356,6 +355,12 @@ int lmv_connect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt)
 
        md_init_ea_size(tgt->ltd_exp, lmv->max_easize, lmv->max_def_easize);
 
+       rc = lqos_add_tgt(&lmv->lmv_qos, tgt);
+       if (rc) {
+               obd_disconnect(mdc_exp);
+               RETURN(rc);
+       }
+
        CDEBUG(D_CONFIG, "Connected to %s(%s) successfully (%d)\n",
                mdc_obd->obd_name, mdc_obd->obd_uuid.uuid,
                atomic_read(&obd->obd_refcount));
@@ -375,6 +380,8 @@ static void lmv_del_target(struct lmv_obd *lmv, int index)
        if (lmv->tgts[index] == NULL)
                return;
 
+       lqos_del_tgt(&lmv->lmv_qos, lmv->tgts[index]);
+
        OBD_FREE_PTR(lmv->tgts[index]);
        lmv->tgts[index] = NULL;
        return;
@@ -384,11 +391,12 @@ static int lmv_add_target(struct obd_device *obd, struct obd_uuid *uuidp,
                           __u32 index, int gen)
 {
        struct obd_device *mdc_obd;
-        struct lmv_obd      *lmv = &obd->u.lmv;
-        struct lmv_tgt_desc *tgt;
-       int                  orig_tgt_count = 0;
-        int                  rc = 0;
-        ENTRY;
+       struct lmv_obd *lmv = &obd->u.lmv;
+       struct lmv_tgt_desc *tgt;
+       int orig_tgt_count = 0;
+       int rc = 0;
+
+       ENTRY;
 
        CDEBUG(D_CONFIG, "Target uuid: %s. index %d\n", uuidp->uuid, index);
        mdc_obd = class_find_client_obd(uuidp, LUSTRE_MDC_NAME,
@@ -447,7 +455,7 @@ static int lmv_add_target(struct obd_device *obd, struct obd_uuid *uuidp,
        }
 
        mutex_init(&tgt->ltd_fid_mutex);
-       tgt->ltd_idx = index;
+       tgt->ltd_index = index;
        tgt->ltd_uuid = *uuidp;
        tgt->ltd_active = 0;
        lmv->tgts[index] = tgt;
@@ -1111,7 +1119,7 @@ hsm_req_err:
                        RETURN(-EINVAL);
 
                /* only files on same MDT can have their layouts swapped */
-               if (tgt1->ltd_idx != tgt2->ltd_idx)
+               if (tgt1->ltd_index != tgt2->ltd_index)
                        RETURN(-EPERM);
 
                rc = obd_iocontrol(cmd, tgt1->ltd_exp, len, karg, uarg);
@@ -1264,9 +1272,12 @@ int lmv_fid_alloc(const struct lu_env *env, struct obd_export *exp,
 
 static int lmv_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
 {
-       struct lmv_obd  *lmv = &obd->u.lmv;
+       struct lmv_obd *lmv = &obd->u.lmv;
        struct lmv_desc *desc;
-       int             rc;
+       struct lnet_process_id lnet_id;
+       int i = 0;
+       int rc;
+
        ENTRY;
 
         if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) {
@@ -1289,13 +1300,35 @@ static int lmv_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
        obd_str2uuid(&lmv->desc.ld_uuid, desc->ld_uuid.uuid);
        lmv->desc.ld_tgt_count = 0;
        lmv->desc.ld_active_tgt_count = 0;
-       lmv->desc.ld_qos_maxage = 60;
+       lmv->desc.ld_qos_maxage = LMV_DESC_QOS_MAXAGE_DEFAULT;
        lmv->max_def_easize = 0;
        lmv->max_easize = 0;
 
        spin_lock_init(&lmv->lmv_lock);
        mutex_init(&lmv->lmv_init_mutex);
 
+       /* Set up allocation policy (QoS and RR) */
+       INIT_LIST_HEAD(&lmv->lmv_qos.lq_svr_list);
+       init_rwsem(&lmv->lmv_qos.lq_rw_sem);
+       lmv->lmv_qos.lq_dirty = 1;
+       lmv->lmv_qos.lq_rr.lqr_dirty = 1;
+       lmv->lmv_qos.lq_reset = 1;
+       /* Default priority is toward free space balance */
+       lmv->lmv_qos.lq_prio_free = 232;
+       /* Default threshold for rr (roughly 17%) */
+       lmv->lmv_qos.lq_threshold_rr = 43;
+
+       /*
+        * initialize rr_index to lower 32bit of netid, so that client
+        * can distribute subdirs evenly from the beginning.
+        */
+       while (LNetGetId(i++, &lnet_id) != -ENOENT) {
+               if (LNET_NETTYP(LNET_NIDNET(lnet_id.nid)) != LOLND) {
+                       lmv->lmv_qos_rr_index = (u32)lnet_id.nid;
+                       break;
+               }
+       }
+
        rc = lmv_tunables_init(obd);
        if (rc)
                CWARN("%s: error adding LMV sysfs/debugfs files: rc = %d\n",
@@ -1472,6 +1505,7 @@ static int lmv_statfs_update(void *cookie, int rc)
                tgt->ltd_statfs = *osfs;
                tgt->ltd_statfs_age = ktime_get_seconds();
                spin_unlock(&lmv->lmv_lock);
+               lmv->lmv_qos.lq_dirty = 1;
        }
 
        return rc;
@@ -1563,7 +1597,7 @@ static int lmv_getattr(struct obd_export *exp, struct md_op_data *op_data,
                 RETURN(PTR_ERR(tgt));
 
        if (op_data->op_flags & MF_GET_MDT_IDX) {
-               op_data->op_mds = tgt->ltd_idx;
+               op_data->op_mds = tgt->ltd_index;
                RETURN(0);
        }
 
@@ -1613,17 +1647,6 @@ static int lmv_close(struct obd_export *exp, struct md_op_data *op_data,
         RETURN(rc);
 }
 
-static struct lmv_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv, __u32 *mdt)
-{
-       static unsigned int rr_index;
-
-       /* locate MDT round-robin is the first step */
-       *mdt = rr_index % lmv->tgts_size;
-       rr_index++;
-
-       return lmv->tgts[*mdt];
-}
-
 static struct lmv_tgt_desc *
 lmv_locate_tgt_by_name(struct lmv_obd *lmv, struct lmv_stripe_md *lsm,
                       const char *name, int namelen, struct lu_fid *fid,
@@ -1637,7 +1660,7 @@ lmv_locate_tgt_by_name(struct lmv_obd *lmv, struct lmv_stripe_md *lsm,
                if (IS_ERR(tgt))
                        return tgt;
 
-               *mds = tgt->ltd_idx;
+               *mds = tgt->ltd_index;
                return tgt;
        }
 
@@ -1724,12 +1747,18 @@ lmv_locate_tgt(struct lmv_obd *lmv, struct md_op_data *op_data)
                   lmv_dir_space_hashed(op_data->op_default_mea1) &&
                   !lmv_dir_striped(lsm)) {
                tgt = lmv_locate_tgt_qos(lmv, &op_data->op_mds);
+               if (tgt == ERR_PTR(-EAGAIN))
+                       tgt = lmv_locate_tgt_rr(lmv, &op_data->op_mds);
                /*
                 * only update statfs when mkdir under dir with "space" hash,
                 * this means the cached statfs may be stale, and current mkdir
                 * may not follow QoS accurately, but it's not serious, and it
                 * avoids periodic statfs when client doesn't mkdir under
                 * "space" hashed directories.
+                *
+                * TODO: after MDT support QoS object allocation, also update
+                * statfs for 'lfs mkdir -i -1 ...", currently it's done in user
+                * space.
                 */
                if (!IS_ERR(tgt)) {
                        struct obd_device *obd;
@@ -1849,7 +1878,7 @@ int lmv_create(struct obd_export *exp, struct md_op_data *op_data,
                if (IS_ERR(tgt))
                        RETURN(PTR_ERR(tgt));
 
-               op_data->op_mds = tgt->ltd_idx;
+               op_data->op_mds = tgt->ltd_index;
        }
 
        CDEBUG(D_INODE, "CREATE obj "DFID" -> mds #%x\n",
@@ -1884,7 +1913,7 @@ lmv_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo,
                RETURN(PTR_ERR(tgt));
 
        CDEBUG(D_INODE, "ENQUEUE on "DFID" -> mds #%u\n",
-              PFID(&op_data->op_fid1), tgt->ltd_idx);
+              PFID(&op_data->op_fid1), tgt->ltd_index);
 
        rc = md_enqueue(tgt->ltd_exp, einfo, policy, op_data, lockh,
                        extra_lock_flags);
@@ -1911,7 +1940,7 @@ retry:
 
        CDEBUG(D_INODE, "GETATTR_NAME for %*s on "DFID" -> mds #%d\n",
                (int)op_data->op_namelen, op_data->op_name,
-               PFID(&op_data->op_fid1), tgt->ltd_idx);
+               PFID(&op_data->op_fid1), tgt->ltd_index);
 
        rc = md_getattr_name(tgt->ltd_exp, op_data, preq);
        if (rc == -ENOENT && lmv_dir_retry_check_update(op_data)) {
@@ -1967,7 +1996,7 @@ static int lmv_early_cancel(struct obd_export *exp, struct lmv_tgt_desc *tgt,
                        RETURN(PTR_ERR(tgt));
        }
 
-       if (tgt->ltd_idx != op_tgt) {
+       if (tgt->ltd_index != op_tgt) {
                CDEBUG(D_INODE, "EARLY_CANCEL on "DFID"\n", PFID(fid));
                policy.l_inodebits.bits = bits;
                rc = md_cancel_unused(tgt->ltd_exp, fid, &policy,
@@ -2014,7 +2043,7 @@ static int lmv_link(struct obd_export *exp, struct md_op_data *op_data,
         * Cancel UPDATE lock on child (fid1).
         */
        op_data->op_flags |= MF_MDC_CANCEL_FID2;
-       rc = lmv_early_cancel(exp, NULL, op_data, tgt->ltd_idx, LCK_EX,
+       rc = lmv_early_cancel(exp, NULL, op_data, tgt->ltd_index, LCK_EX,
                              MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID1);
        if (rc != 0)
                RETURN(rc);
@@ -2112,7 +2141,7 @@ static int lmv_migrate(struct obd_export *exp, struct md_op_data *op_data,
                RETURN(PTR_ERR(child_tgt));
 
        if (!S_ISDIR(op_data->op_mode) && tp_tgt)
-               rc = __lmv_fid_alloc(lmv, &target_fid, tp_tgt->ltd_idx);
+               rc = __lmv_fid_alloc(lmv, &target_fid, tp_tgt->ltd_index);
        else
                rc = lmv_fid_alloc(NULL, exp, &target_fid, op_data);
        if (rc)
@@ -2138,7 +2167,7 @@ static int lmv_migrate(struct obd_export *exp, struct md_op_data *op_data,
        }
 
        /* cancel UPDATE lock of parent master object */
-       rc = lmv_early_cancel(exp, parent_tgt, op_data, tgt->ltd_idx, LCK_EX,
+       rc = lmv_early_cancel(exp, parent_tgt, op_data, tgt->ltd_index, LCK_EX,
                              MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID1);
        if (rc)
                RETURN(rc);
@@ -2163,14 +2192,14 @@ static int lmv_migrate(struct obd_export *exp, struct md_op_data *op_data,
        op_data->op_fid4 = target_fid;
 
        /* cancel UPDATE locks of target parent */
-       rc = lmv_early_cancel(exp, tp_tgt, op_data, tgt->ltd_idx, LCK_EX,
+       rc = lmv_early_cancel(exp, tp_tgt, op_data, tgt->ltd_index, LCK_EX,
                              MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID2);
        if (rc)
                RETURN(rc);
 
        /* cancel LOOKUP lock of source if source is remote object */
        if (child_tgt != sp_tgt) {
-               rc = lmv_early_cancel(exp, sp_tgt, op_data, tgt->ltd_idx,
+               rc = lmv_early_cancel(exp, sp_tgt, op_data, tgt->ltd_index,
                                      LCK_EX, MDS_INODELOCK_LOOKUP,
                                      MF_MDC_CANCEL_FID3);
                if (rc)
@@ -2178,7 +2207,7 @@ static int lmv_migrate(struct obd_export *exp, struct md_op_data *op_data,
        }
 
        /* cancel ELC locks of source */
-       rc = lmv_early_cancel(exp, child_tgt, op_data, tgt->ltd_idx, LCK_EX,
+       rc = lmv_early_cancel(exp, child_tgt, op_data, tgt->ltd_index, LCK_EX,
                              MDS_INODELOCK_ELC, MF_MDC_CANCEL_FID3);
        if (rc)
                RETURN(rc);
@@ -2238,7 +2267,7 @@ static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data,
        op_data->op_flags |= MF_MDC_CANCEL_FID4;
 
        /* cancel UPDATE locks of target parent */
-       rc = lmv_early_cancel(exp, tp_tgt, op_data, tgt->ltd_idx, LCK_EX,
+       rc = lmv_early_cancel(exp, tp_tgt, op_data, tgt->ltd_index, LCK_EX,
                              MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID2);
        if (rc != 0)
                RETURN(rc);
@@ -2247,7 +2276,7 @@ static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data,
                /* cancel LOOKUP lock of target on target parent */
                if (tgt != tp_tgt) {
                        rc = lmv_early_cancel(exp, tp_tgt, op_data,
-                                             tgt->ltd_idx, LCK_EX,
+                                             tgt->ltd_index, LCK_EX,
                                              MDS_INODELOCK_LOOKUP,
                                              MF_MDC_CANCEL_FID4);
                        if (rc != 0)
@@ -2261,7 +2290,7 @@ static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data,
                        RETURN(PTR_ERR(src_tgt));
 
                /* cancel ELC locks of source */
-               rc = lmv_early_cancel(exp, src_tgt, op_data, tgt->ltd_idx,
+               rc = lmv_early_cancel(exp, src_tgt, op_data, tgt->ltd_index,
                                      LCK_EX, MDS_INODELOCK_ELC,
                                      MF_MDC_CANCEL_FID3);
                if (rc != 0)
@@ -2276,7 +2305,7 @@ retry:
                RETURN(PTR_ERR(sp_tgt));
 
        /* cancel UPDATE locks of source parent */
-       rc = lmv_early_cancel(exp, sp_tgt, op_data, tgt->ltd_idx, LCK_EX,
+       rc = lmv_early_cancel(exp, sp_tgt, op_data, tgt->ltd_index, LCK_EX,
                              MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID1);
        if (rc != 0)
                RETURN(rc);
@@ -2285,7 +2314,7 @@ retry:
                /* cancel LOOKUP lock of source on source parent */
                if (src_tgt != sp_tgt) {
                        rc = lmv_early_cancel(exp, sp_tgt, op_data,
-                                             tgt->ltd_idx, LCK_EX,
+                                             tgt->ltd_index, LCK_EX,
                                              MDS_INODELOCK_LOOKUP,
                                              MF_MDC_CANCEL_FID3);
                        if (rc != 0)
@@ -2330,7 +2359,7 @@ rename:
                /* cancel LOOKUP lock of target on target parent */
                if (tgt != tp_tgt) {
                        rc = lmv_early_cancel(exp, tp_tgt, op_data,
-                                             tgt->ltd_idx, LCK_EX,
+                                             tgt->ltd_index, LCK_EX,
                                              MDS_INODELOCK_LOOKUP,
                                              MF_MDC_CANCEL_FID4);
                        if (rc != 0)
@@ -2828,17 +2857,18 @@ retry:
        op_data->op_flags |= MF_MDC_CANCEL_FID1 | MF_MDC_CANCEL_FID3;
 
        if (parent_tgt != tgt)
-               rc = lmv_early_cancel(exp, parent_tgt, op_data, tgt->ltd_idx,
+               rc = lmv_early_cancel(exp, parent_tgt, op_data, tgt->ltd_index,
                                      LCK_EX, MDS_INODELOCK_LOOKUP,
                                      MF_MDC_CANCEL_FID3);
 
-       rc = lmv_early_cancel(exp, NULL, op_data, tgt->ltd_idx, LCK_EX,
+       rc = lmv_early_cancel(exp, NULL, op_data, tgt->ltd_index, LCK_EX,
                              MDS_INODELOCK_ELC, MF_MDC_CANCEL_FID3);
        if (rc)
                RETURN(rc);
 
        CDEBUG(D_INODE, "unlink with fid="DFID"/"DFID" -> mds #%u\n",
-              PFID(&op_data->op_fid1), PFID(&op_data->op_fid2), tgt->ltd_idx);
+              PFID(&op_data->op_fid1), PFID(&op_data->op_fid2),
+              tgt->ltd_index);
 
        rc = md_unlink(tgt->ltd_exp, op_data, request);
        if (rc == -ENOENT && lmv_dir_retry_check_update(op_data)) {
diff --git a/lustre/lmv/lmv_qos.c b/lustre/lmv/lmv_qos.c
new file mode 100644 (file)
index 0000000..685cc01
--- /dev/null
@@ -0,0 +1,446 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lustre/lmv/lmv_qos.c
+ *
+ * LMV QoS.
+ * These are the only exported functions, they provide some generic
+ * infrastructure for object allocation QoS
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_LMV
+
+#include <asm/div64.h>
+#include <libcfs/libcfs.h>
+#include <uapi/linux/lustre/lustre_idl.h>
+#include <lustre_swab.h>
+#include <obd_class.h>
+
+#include "lmv_internal.h"
+
+static inline __u64 tgt_statfs_bavail(struct lu_tgt_desc *tgt)
+{
+       struct obd_statfs *statfs = &tgt->ltd_statfs;
+
+       return statfs->os_bavail * statfs->os_bsize;
+}
+
+static inline __u64 tgt_statfs_iavail(struct lu_tgt_desc *tgt)
+{
+       return tgt->ltd_statfs.os_ffree;
+}
+
+/**
+ * Calculate penalties per-tgt and per-server
+ *
+ * Re-calculate penalties when the configuration changes, active targets
+ * change and after statfs refresh (all these are reflected by lq_dirty flag).
+ * On every MDT and MDS: decay the penalty by half for every 8x the update
+ * interval that the device has been idle. That gives lots of time for the
+ * statfs information to be updated (which the penalty is only a proxy for),
+ * and avoids penalizing MDS/MDTs under light load.
+ * See lmv_qos_calc_weight() for how penalties are factored into the weight.
+ *
+ * \param[in] lmv      LMV device
+ *
+ * \retval 0           on success
+ * \retval -EAGAIN     the number of MDTs isn't enough or all MDT spaces are
+ *                     almost the same
+ */
+static int lmv_qos_calc_ppts(struct lmv_obd *lmv)
+{
+       struct lu_qos *qos = &lmv->lmv_qos;
+       struct lu_tgt_desc *tgt;
+       struct lu_svr_qos *svr;
+       __u64 ba_max, ba_min, ba;
+       __u64 ia_max, ia_min, ia;
+       __u32 num_active;
+       unsigned int i;
+       int prio_wide;
+       time64_t now, age;
+       __u32 maxage = lmv->desc.ld_qos_maxage;
+       int rc;
+
+       ENTRY;
+
+       if (!qos->lq_dirty)
+               GOTO(out, rc = 0);
+
+       num_active = lmv->desc.ld_active_tgt_count;
+       if (num_active < 2)
+               GOTO(out, rc = -EAGAIN);
+
+       /* find bavail on each server */
+       list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) {
+               svr->lsq_bavail = 0;
+               svr->lsq_iavail = 0;
+       }
+       qos->lq_active_svr_count = 0;
+
+       /*
+        * How badly user wants to select targets "widely" (not recently chosen
+        * and not on recent MDS's).  As opposed to "freely" (free space avail.)
+        * 0-256
+        */
+       prio_wide = 256 - qos->lq_prio_free;
+
+       ba_min = (__u64)(-1);
+       ba_max = 0;
+       ia_min = (__u64)(-1);
+       ia_max = 0;
+       now = ktime_get_real_seconds();
+
+       /* Calculate server penalty per object */
+       for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+               tgt = lmv->tgts[i];
+               if (!tgt || !tgt->ltd_exp || !tgt->ltd_active)
+                       continue;
+
+               /* bavail >> 16 to avoid overflow */
+               ba = tgt_statfs_bavail(tgt) >> 16;
+               if (!ba)
+                       continue;
+
+               ba_min = min(ba, ba_min);
+               ba_max = max(ba, ba_max);
+
+               /* iavail >> 8 to avoid overflow */
+               ia = tgt_statfs_iavail(tgt) >> 8;
+               if (!ia)
+                       continue;
+
+               ia_min = min(ia, ia_min);
+               ia_max = max(ia, ia_max);
+
+               /* Count the number of usable MDS's */
+               if (tgt->ltd_qos.ltq_svr->lsq_bavail == 0)
+                       qos->lq_active_svr_count++;
+               tgt->ltd_qos.ltq_svr->lsq_bavail += ba;
+               tgt->ltd_qos.ltq_svr->lsq_iavail += ia;
+
+               /*
+                * per-MDT penalty is
+                * prio * bavail * iavail / (num_tgt - 1) / 2
+                */
+               tgt->ltd_qos.ltq_penalty_per_obj = prio_wide * ba * ia;
+               do_div(tgt->ltd_qos.ltq_penalty_per_obj, num_active - 1);
+               tgt->ltd_qos.ltq_penalty_per_obj >>= 1;
+
+               age = (now - tgt->ltd_qos.ltq_used) >> 3;
+               if (qos->lq_reset || age > 32 * maxage)
+                       tgt->ltd_qos.ltq_penalty = 0;
+               else if (age > maxage)
+                       /* Decay tgt penalty. */
+                       tgt->ltd_qos.ltq_penalty >>= (age / maxage);
+       }
+
+       num_active = qos->lq_active_svr_count;
+       if (num_active < 2) {
+               /*
+                * If there's only 1 MDS, we can't penalize it, so instead
+                * we have to double the MDT penalty
+                */
+               num_active = 2;
+               for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+                       tgt = lmv->tgts[i];
+                       if (!tgt || !tgt->ltd_exp || !tgt->ltd_active)
+                               continue;
+
+                       tgt->ltd_qos.ltq_penalty_per_obj <<= 1;
+               }
+       }
+
+       /*
+        * Per-MDS penalty is
+        * prio * bavail * iavail / server_tgts / (num_svr - 1) / 2
+        */
+       list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) {
+               ba = svr->lsq_bavail;
+               ia = svr->lsq_iavail;
+               svr->lsq_penalty_per_obj = prio_wide * ba  * ia;
+               do_div(ba, svr->lsq_tgt_count * (num_active - 1));
+               svr->lsq_penalty_per_obj >>= 1;
+
+               age = (now - svr->lsq_used) >> 3;
+               if (qos->lq_reset || age > 32 * maxage)
+                       svr->lsq_penalty = 0;
+               else if (age > maxage)
+                       /* Decay server penalty. */
+                       svr->lsq_penalty >>= age / maxage;
+       }
+
+       qos->lq_dirty = 0;
+       qos->lq_reset = 0;
+
+       /*
+        * If each MDT has almost same free space, do rr allocation for better
+        * creation performance
+        */
+       qos->lq_same_space = 0;
+       if ((ba_max * (256 - qos->lq_threshold_rr)) >> 8 < ba_min &&
+           (ia_max * (256 - qos->lq_threshold_rr)) >> 8 < ia_min) {
+               qos->lq_same_space = 1;
+               /* Reset weights for the next time we enter qos mode */
+               qos->lq_reset = 1;
+       }
+       rc = 0;
+
+out:
+       if (!rc && qos->lq_same_space)
+               RETURN(-EAGAIN);
+
+       RETURN(rc);
+}
+
+static inline bool lmv_qos_is_usable(struct lmv_obd *lmv)
+{
+       if (!lmv->lmv_qos.lq_dirty && lmv->lmv_qos.lq_same_space)
+               return false;
+
+       if (lmv->desc.ld_active_tgt_count < 2)
+               return false;
+
+       return true;
+}
+
+/**
+ * Calculate weight for a given MDT.
+ *
+ * The final MDT weight is bavail >> 16 * iavail >> 8 minus the MDT and MDS
+ * penalties.  See lmv_qos_calc_ppts() for how penalties are calculated.
+ *
+ * \param[in] tgt      MDT target descriptor
+ */
+static void lmv_qos_calc_weight(struct lu_tgt_desc *tgt)
+{
+       struct lu_tgt_qos *ltq = &tgt->ltd_qos;
+       __u64 temp, temp2;
+
+       temp = (tgt_statfs_bavail(tgt) >> 16) * (tgt_statfs_iavail(tgt) >> 8);
+       temp2 = ltq->ltq_penalty + ltq->ltq_svr->lsq_penalty;
+       if (temp < temp2)
+               ltq->ltq_weight = 0;
+       else
+               ltq->ltq_weight = temp - temp2;
+}
+
+/**
+ * Re-calculate weights.
+ *
+ * The function is called when some target was used for a new object. In
+ * this case we should re-calculate all the weights to keep new allocations
+ * balanced well.
+ *
+ * \param[in] lmv      LMV device
+ * \param[in] tgt      target where a new object was placed
+ * \param[out] total_wt        new total weight for the pool
+ *
+ * \retval             0
+ */
+static int lmv_qos_used(struct lmv_obd *lmv, struct lu_tgt_desc *tgt,
+                       __u64 *total_wt)
+{
+       struct lu_tgt_qos *ltq;
+       struct lu_svr_qos *svr;
+       unsigned int i;
+
+       ENTRY;
+
+       ltq = &tgt->ltd_qos;
+       LASSERT(ltq);
+
+       /* Don't allocate on this device anymore, until the next alloc_qos */
+       ltq->ltq_usable = 0;
+
+       svr = ltq->ltq_svr;
+
+       /*
+        * Decay old penalty by half (we're adding max penalty, and don't
+        * want it to run away.)
+        */
+       ltq->ltq_penalty >>= 1;
+       svr->lsq_penalty >>= 1;
+
+       /* mark the MDS and MDT as recently used */
+       ltq->ltq_used = svr->lsq_used = ktime_get_real_seconds();
+
+       /* Set max penalties for this MDT and MDS */
+       ltq->ltq_penalty += ltq->ltq_penalty_per_obj *
+                           lmv->desc.ld_active_tgt_count;
+       svr->lsq_penalty += svr->lsq_penalty_per_obj *
+               lmv->lmv_qos.lq_active_svr_count;
+
+       /* Decrease all MDS penalties */
+       list_for_each_entry(svr, &lmv->lmv_qos.lq_svr_list, lsq_svr_list) {
+               if (svr->lsq_penalty < svr->lsq_penalty_per_obj)
+                       svr->lsq_penalty = 0;
+               else
+                       svr->lsq_penalty -= svr->lsq_penalty_per_obj;
+       }
+
+       *total_wt = 0;
+       /* Decrease all MDT penalties */
+       for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+               ltq = &lmv->tgts[i]->ltd_qos;
+               if (!tgt || !tgt->ltd_exp || !tgt->ltd_active)
+                       continue;
+
+               if (ltq->ltq_penalty < ltq->ltq_penalty_per_obj)
+                       ltq->ltq_penalty = 0;
+               else
+                       ltq->ltq_penalty -= ltq->ltq_penalty_per_obj;
+
+               lmv_qos_calc_weight(lmv->tgts[i]);
+
+               /* Recalc the total weight of usable osts */
+               if (ltq->ltq_usable)
+                       *total_wt += ltq->ltq_weight;
+
+               CDEBUG(D_OTHER, "recalc tgt %d usable=%d avail=%llu"
+                         " tgtppo=%llu tgtp=%llu svrppo=%llu"
+                         " svrp=%llu wt=%llu\n",
+                         i, ltq->ltq_usable,
+                         tgt_statfs_bavail(tgt) >> 10,
+                         ltq->ltq_penalty_per_obj >> 10,
+                         ltq->ltq_penalty >> 10,
+                         ltq->ltq_svr->lsq_penalty_per_obj >> 10,
+                         ltq->ltq_svr->lsq_penalty >> 10,
+                         ltq->ltq_weight >> 10);
+       }
+
+       RETURN(0);
+}
+
+struct lu_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv, __u32 *mdt)
+{
+       struct lu_tgt_desc *tgt;
+       __u64 total_weight = 0;
+       __u64 cur_weight = 0;
+       __u64 rand;
+       int i;
+       int rc;
+
+       ENTRY;
+
+       if (!lmv_qos_is_usable(lmv))
+               RETURN(ERR_PTR(-EAGAIN));
+
+       down_write(&lmv->lmv_qos.lq_rw_sem);
+
+       if (!lmv_qos_is_usable(lmv))
+               GOTO(unlock, tgt = ERR_PTR(-EAGAIN));
+
+       rc = lmv_qos_calc_ppts(lmv);
+       if (rc)
+               GOTO(unlock, tgt = ERR_PTR(rc));
+
+       for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+               tgt = lmv->tgts[i];
+               if (!tgt)
+                       continue;
+
+               tgt->ltd_qos.ltq_usable = 0;
+               if (!tgt->ltd_exp || !tgt->ltd_active)
+                       continue;
+
+               tgt->ltd_qos.ltq_usable = 1;
+               lmv_qos_calc_weight(tgt);
+               total_weight += tgt->ltd_qos.ltq_weight;
+       }
+
+       if (total_weight) {
+#if BITS_PER_LONG == 32
+               rand = cfs_rand() % (unsigned int)total_weight;
+               /*
+                * If total_weight > 32-bit, first generate the high
+                * 32 bits of the random number, then add in the low
+                * 32 bits (truncated to the upper limit, if needed)
+                */
+               if (total_weight > 0xffffffffULL)
+                       rand = (__u64)(cfs_rand() %
+                               (unsigned int)(total_weight >> 32)) << 32;
+               else
+                       rand = 0;
+
+               if (rand == (total_weight & 0xffffffff00000000ULL))
+                       rand |= cfs_rand() % (unsigned int)total_weight;
+               else
+                       rand |= cfs_rand();
+
+#else
+               rand = ((__u64)cfs_rand() << 32 | cfs_rand()) % total_weight;
+#endif
+       } else {
+               rand = 0;
+       }
+
+       for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+               tgt = lmv->tgts[i];
+
+               if (!tgt || !tgt->ltd_qos.ltq_usable)
+                       continue;
+
+               cur_weight += tgt->ltd_qos.ltq_weight;
+               if (cur_weight < rand)
+                       continue;
+
+               *mdt = tgt->ltd_index;
+               lmv_qos_used(lmv, tgt, &total_weight);
+               GOTO(unlock, rc = 0);
+       }
+
+       /* no proper target found */
+       GOTO(unlock, tgt = ERR_PTR(-EAGAIN));
+unlock:
+       up_write(&lmv->lmv_qos.lq_rw_sem);
+
+       return tgt;
+}
+
+struct lu_tgt_desc *lmv_locate_tgt_rr(struct lmv_obd *lmv, __u32 *mdt)
+{
+       struct lu_tgt_desc *tgt;
+       int i;
+
+       ENTRY;
+
+       spin_lock(&lmv->lmv_qos.lq_rr.lqr_alloc);
+       for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
+               tgt = lmv->tgts[(i + lmv->lmv_qos_rr_index) %
+                               lmv->desc.ld_tgt_count];
+               if (tgt && tgt->ltd_exp && tgt->ltd_active) {
+                       *mdt = tgt->ltd_index;
+                       lmv->lmv_qos_rr_index =
+                               (i + lmv->lmv_qos_rr_index + 1) %
+                               lmv->desc.ld_tgt_count;
+                       spin_unlock(&lmv->lmv_qos.lq_rr.lqr_alloc);
+
+                       RETURN(tgt);
+               }
+       }
+       spin_unlock(&lmv->lmv_qos.lq_rr.lqr_alloc);
+
+       RETURN(ERR_PTR(-ENODEV));
+}
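
To make the weight math documented in lmv_qos_calc_weight() and
lmv_locate_tgt_qos() above concrete, here is a small standalone model
(not the patch code; struct mdt, calc_weight() and the sample numbers
are invented) of computing per-MDT weights and then picking one MDT
with probability proportional to its weight:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct mdt {
	uint64_t bavail;	/* bytes free, from cached statfs */
	uint64_t iavail;	/* inodes free, from cached statfs */
	uint64_t penalty;	/* ltq_penalty + lsq_penalty */
	uint64_t weight;
};

/* weight = (bavail >> 16) * (iavail >> 8) - penalties, clamped at 0 */
static uint64_t calc_weight(const struct mdt *t)
{
	uint64_t w = (t->bavail >> 16) * (t->iavail >> 8);

	return w > t->penalty ? w - t->penalty : 0;
}

int main(void)
{
	struct mdt mdts[2] = {
		{ .bavail = 8ULL << 30, .iavail = 1 << 20 },	/* 8 GiB free */
		{ .bavail = 2ULL << 30, .iavail = 1 << 20 },	/* 2 GiB free */
	};
	uint64_t total = 0, cur = 0, rnd;
	int i;

	for (i = 0; i < 2; i++) {
		mdts[i].weight = calc_weight(&mdts[i]);
		total += mdts[i].weight;
	}

	/* 64-bit random value below the total weight, as in the patch */
	rnd = (((uint64_t)rand() << 32) | (uint64_t)rand()) % total;

	/* Walk targets until the accumulated weight covers the random value;
	 * MDT0000 has 4x the weight of MDT0001, so it wins ~80% of the time. */
	for (i = 0; i < 2; i++) {
		cur += mdts[i].weight;
		if (rnd < cur) {
			printf("picked MDT%04x\n", i);
			break;
		}
	}
	return 0;
}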
index 95fe927..981b032 100644 (file)
@@ -75,6 +75,109 @@ static ssize_t desc_uuid_show(struct kobject *kobj, struct attribute *attr,
 }
 LUSTRE_RO_ATTR(desc_uuid);
 
+static ssize_t qos_maxage_show(struct kobject *kobj,
+                              struct attribute *attr,
+                              char *buf)
+{
+       struct obd_device *dev = container_of(kobj, struct obd_device,
+                                             obd_kset.kobj);
+
+       return sprintf(buf, "%u\n", dev->u.lmv.desc.ld_qos_maxage);
+}
+
+static ssize_t qos_maxage_store(struct kobject *kobj,
+                               struct attribute *attr,
+                               const char *buffer,
+                               size_t count)
+{
+       struct obd_device *dev = container_of(kobj, struct obd_device,
+                                             obd_kset.kobj);
+       unsigned int val;
+       int rc;
+
+       rc = kstrtouint(buffer, 0, &val);
+       if (rc)
+               return rc;
+
+       dev->u.lmv.desc.ld_qos_maxage = val;
+
+       return count;
+}
+LUSTRE_RW_ATTR(qos_maxage);
+
+static ssize_t qos_prio_free_show(struct kobject *kobj,
+                                 struct attribute *attr,
+                                 char *buf)
+{
+       struct obd_device *dev = container_of(kobj, struct obd_device,
+                                             obd_kset.kobj);
+
+       return sprintf(buf, "%u%%\n",
+                      (dev->u.lmv.lmv_qos.lq_prio_free * 100 + 255) >> 8);
+}
+
+static ssize_t qos_prio_free_store(struct kobject *kobj,
+                                  struct attribute *attr,
+                                  const char *buffer,
+                                  size_t count)
+{
+       struct obd_device *dev = container_of(kobj, struct obd_device,
+                                             obd_kset.kobj);
+       struct lmv_obd *lmv = &dev->u.lmv;
+       unsigned int val;
+       int rc;
+
+       rc = kstrtouint(buffer, 0, &val);
+       if (rc)
+               return rc;
+
+       if (val > 100)
+               return -EINVAL;
+
+       lmv->lmv_qos.lq_prio_free = (val << 8) / 100;
+       lmv->lmv_qos.lq_dirty = 1;
+       lmv->lmv_qos.lq_reset = 1;
+
+       return count;
+}
+LUSTRE_RW_ATTR(qos_prio_free);
+
+static ssize_t qos_threshold_rr_show(struct kobject *kobj,
+                                    struct attribute *attr,
+                                    char *buf)
+{
+       struct obd_device *dev = container_of(kobj, struct obd_device,
+                                             obd_kset.kobj);
+
+       return sprintf(buf, "%u%%\n",
+                      (dev->u.lmv.lmv_qos.lq_threshold_rr * 100 + 255) >> 8);
+}
+
+static ssize_t qos_threshold_rr_store(struct kobject *kobj,
+                                     struct attribute *attr,
+                                     const char *buffer,
+                                     size_t count)
+{
+       struct obd_device *dev = container_of(kobj, struct obd_device,
+                                             obd_kset.kobj);
+       struct lmv_obd *lmv = &dev->u.lmv;
+       unsigned int val;
+       int rc;
+
+       rc = kstrtouint(buffer, 0, &val);
+       if (rc)
+               return rc;
+
+       if (val > 100)
+               return -EINVAL;
+
+       lmv->lmv_qos.lq_threshold_rr = (val << 8) / 100;
+       lmv->lmv_qos.lq_dirty = 1;
+
+       return count;
+}
+LUSTRE_RW_ATTR(qos_threshold_rr);
+
 #ifdef CONFIG_PROC_FS
 static void *lmv_tgt_seq_start(struct seq_file *p, loff_t *pos)
 {
@@ -117,7 +220,7 @@ static int lmv_tgt_seq_show(struct seq_file *p, void *v)
                return 0;
 
        seq_printf(p, "%u: %s %sACTIVE\n",
-                  tgt->ltd_idx, tgt->ltd_uuid.uuid,
+                  tgt->ltd_index, tgt->ltd_uuid.uuid,
                   tgt->ltd_active ? "" : "IN");
        return 0;
 }
@@ -156,6 +259,9 @@ static struct attribute *lmv_attrs[] = {
        &lustre_attr_activeobd.attr,
        &lustre_attr_desc_uuid.attr,
        &lustre_attr_numobd.attr,
+       &lustre_attr_qos_maxage.attr,
+       &lustre_attr_qos_prio_free.attr,
+       &lustre_attr_qos_threshold_rr.attr,
        NULL,
 };
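
The two percentage tunables above store their values on a 0-256 scale.
A quick standalone check of that round trip (the variable names here
are illustrative, not Lustre code; the defaults 232 and 43 come from
lmv_setup() above):

#include <assert.h>
#include <stdio.h>

int main(void)
{
	unsigned int val = 90;					/* written via sysfs */
	unsigned int lq_prio_free = (val << 8) / 100;		/* stored: 230 of 256 */
	unsigned int shown = (lq_prio_free * 100 + 255) >> 8;	/* read back */

	assert(lq_prio_free == 230 && shown == 90);

	/* The defaults 232 and 43 read back as 91% and 17% respectively. */
	printf("%u%% %u%%\n", (232 * 100 + 255) >> 8, (43 * 100 + 255) >> 8);
	return 0;
}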
 
index a9f5ffe..83ceeb8 100644 (file)
 #define LMVEA_DELETE_VALUES(count, offset)                             \
        ((count) == 0 && (offset) == (typeof(offset))(-1))
 
-struct lod_qos_rr {
-       spinlock_t               lqr_alloc;     /* protect allocation index */
-       __u32                    lqr_start_idx; /* start index of new inode */
-       __u32                    lqr_offset_idx;/* aliasing for start_idx */
-       int                      lqr_start_count;/* reseed counter */
-       struct ost_pool          lqr_pool;      /* round-robin optimized list */
-       unsigned long            lqr_dirty:1;   /* recalc round-robin list */
-};
-
 struct pool_desc {
        char                     pool_name[LOV_MAXPOOLNAME + 1];
        struct ost_pool          pool_obds;     /* pool members */
        atomic_t                 pool_refcount;
-       struct lod_qos_rr        pool_rr;
+       struct lu_qos_rr         pool_rr;
        struct hlist_node        pool_hash;     /* access by poolname */
        struct list_head         pool_list;
        struct proc_dir_entry   *pool_proc_entry;
@@ -78,57 +69,7 @@ struct pool_desc {
 #define pool_tgt_array(p)  ((p)->pool_obds.op_array)
 #define pool_tgt_rw_sem(p) ((p)->pool_obds.op_rw_sem)
 
-struct lod_qos {
-       struct list_head         lq_oss_list;
-       struct rw_semaphore      lq_rw_sem;
-       __u32                    lq_active_oss_count;
-       unsigned int             lq_prio_free;   /* priority for free space */
-       unsigned int             lq_threshold_rr;/* priority for rr */
-       struct lod_qos_rr        lq_rr;          /* round robin qos data */
-       bool                     lq_dirty:1,     /* recalc qos data */
-                                lq_same_space:1,/* the ost's all have approx.
-                                                   the same space avail */
-                                lq_reset:1;     /* zero current penalties */
-};
-
-struct lod_qos_oss {
-       struct obd_uuid          lqo_uuid;      /* ptlrpc's c_remote_uuid */
-       struct list_head         lqo_oss_list;  /* link to lov_qos */
-       __u64                    lqo_bavail;    /* total bytes avail on OSS */
-       __u64                    lqo_penalty;   /* current penalty */
-       __u64                    lqo_penalty_per_obj; /* penalty decrease
-                                                        every obj*/
-       time64_t                 lqo_used;      /* last used time, seconds */
-       __u32                    lqo_ost_count; /* number of osts on this oss */
-       __u32                    lqo_id;        /* unique oss id */
-};
-
-struct ltd_qos {
-       struct lod_qos_oss      *ltq_oss;       /* oss info */
-       __u64                    ltq_penalty;   /* current penalty */
-       __u64                    ltq_penalty_per_obj; /* penalty decrease
-                                                        every obj*/
-       __u64                    ltq_weight;    /* net weighting */
-       time64_t                 ltq_used;      /* last used time, seconds */
-       bool                     ltq_usable:1;  /* usable for striping */
-};
-
-struct lod_tgt_desc {
-       struct dt_device  *ltd_tgt;
-       struct list_head   ltd_kill;
-       struct obd_export *ltd_exp;
-       struct obd_uuid    ltd_uuid;
-       __u32              ltd_gen;
-       __u32              ltd_index;
-       struct ltd_qos     ltd_qos; /* qos info per target */
-       struct obd_statfs  ltd_statfs;
-       struct ptlrpc_thread    *ltd_recovery_thread;
-       unsigned long      ltd_active:1,/* is this target up for requests */
-                          ltd_activate:1,/* should  target be activated */
-                          ltd_reap:1,  /* should this target be deleted */
-                          ltd_got_update_log:1, /* Already got update log */
-                          ltd_connecting:1; /* target is connecting */
-};
+#define lod_tgt_desc   lu_tgt_desc
 
 #define TGT_PTRS               256     /* number of pointers at 1st level */
 #define TGT_PTRS_PER_BLOCK      256     /* number of pointers at 2nd level */
@@ -210,7 +151,7 @@ struct lod_device {
         * structure should be moved to lod_tgt_descs as well.
         */
        /* QoS info per LOD */
-       struct lod_qos        lod_qos; /* qos info per lod */
+       struct lu_qos         lod_qos; /* qos info per lod */
 
        /* OST pool data */
        struct ost_pool         lod_pool_info; /* all OSTs in a packed array */
@@ -233,14 +174,14 @@ struct lod_device {
 #define lod_ostnr      lod_ost_descs.ltd_tgtnr
 #define lod_osts_size  lod_ost_descs.ltd_tgts_size
 #define ltd_ost                ltd_tgt
-#define lod_ost_desc   lod_tgt_desc
+#define lod_ost_desc   lu_tgt_desc
 
 #define lod_mdts               lod_mdt_descs.ltd_tgts
 #define lod_mdt_bitmap         lod_mdt_descs.ltd_tgt_bitmap
 #define lod_remote_mdt_count   lod_mdt_descs.ltd_tgtnr
 #define lod_mdts_size          lod_mdt_descs.ltd_tgts_size
 #define ltd_mdt                        ltd_tgt
-#define lod_mdt_desc           lod_tgt_desc
+#define lod_mdt_desc           lu_tgt_desc
 
 struct lod_layout_component {
        struct lu_extent          llc_extent;
@@ -751,9 +692,7 @@ struct lod_obj_stripe_cb_data {
 int lod_prepare_create(const struct lu_env *env, struct lod_object *lo,
                       struct lu_attr *attr, const struct lu_buf *buf,
                       struct thandle *th);
-int qos_add_tgt(struct lod_device*, struct lod_tgt_desc *);
-int qos_del_tgt(struct lod_device *, struct lod_tgt_desc *);
-void lod_qos_rr_init(struct lod_qos_rr *lqr);
+void lod_qos_rr_init(struct lu_qos_rr *lqr);
 int lod_use_defined_striping(const struct lu_env *, struct lod_object *,
                             const struct lu_buf *);
 int lod_qos_parse_config(const struct lu_env *env, struct lod_object *lo,
index c711e39..b60700c 100644 (file)
@@ -111,7 +111,7 @@ void lod_putref(struct lod_device *lod, struct lod_tgt_descs *ltd)
                        list_del(&tgt_desc->ltd_kill);
                        if (ltd == &lod->lod_ost_descs) {
                                /* remove from QoS structures */
-                               rc = qos_del_tgt(lod, tgt_desc);
+                               rc = lqos_del_tgt(&lod->lod_qos, tgt_desc);
                                if (rc)
                                        CERROR("%s: qos_del_tgt(%s) failed:"
                                               "rc = %d\n",
@@ -364,7 +364,7 @@ int lod_add_device(const struct lu_env *env, struct lod_device *lod,
                        GOTO(out_mutex, rc);
                }
 
-               rc = qos_add_tgt(lod, tgt_desc);
+               rc = lqos_add_tgt(&lod->lod_qos, tgt_desc);
                if (rc) {
                        CERROR("%s: qos_add_tgt failed with %d\n",
                                obd->obd_name, rc);
@@ -2192,7 +2192,7 @@ int lod_pools_init(struct lod_device *lod, struct lustre_cfg *lcfg)
        lod->lod_sp_me = LUSTRE_SP_CLI;
 
        /* Set up allocation policy (QoS and RR) */
-       INIT_LIST_HEAD(&lod->lod_qos.lq_oss_list);
+       INIT_LIST_HEAD(&lod->lod_qos.lq_svr_list);
        init_rwsem(&lod->lod_qos.lq_rw_sem);
        lod->lod_qos.lq_dirty = 1;
        lod->lod_qos.lq_rr.lqr_dirty = 1;
index 5433232..aca8fc7 100644 (file)
                       OST_TGT(lod,i)->ltd_statfs.os_bsize)
 
 /**
- * Add a new target to Quality of Service (QoS) target table.
- *
- * Add a new OST target to the structure representing an OSS. Resort the list
- * of known OSSs by the number of OSTs attached to each OSS. The OSS list is
- * protected internally and no external locking is required.
- *
- * \param[in] lod              LOD device
- * \param[in] ost_desc         OST description
- *
- * \retval 0                   on success
- * \retval -ENOMEM             on error
- */
-int qos_add_tgt(struct lod_device *lod, struct lod_tgt_desc *ost_desc)
-{
-       struct lod_qos_oss *oss = NULL, *temposs;
-       struct obd_export  *exp = ost_desc->ltd_exp;
-       int                 rc = 0, found = 0;
-       struct list_head   *list;
-       __u32 id = 0;
-       ENTRY;
-
-       down_write(&lod->lod_qos.lq_rw_sem);
-       /*
-        * a bit hacky approach to learn NID of corresponding connection
-        * but there is no official API to access information like this
-        * with OSD API.
-        */
-       list_for_each_entry(oss, &lod->lod_qos.lq_oss_list, lqo_oss_list) {
-               if (obd_uuid_equals(&oss->lqo_uuid,
-                                   &exp->exp_connection->c_remote_uuid)) {
-                       found++;
-                       break;
-               }
-               if (oss->lqo_id > id)
-                       id = oss->lqo_id;
-       }
-
-       if (!found) {
-               OBD_ALLOC_PTR(oss);
-               if (!oss)
-                       GOTO(out, rc = -ENOMEM);
-               memcpy(&oss->lqo_uuid, &exp->exp_connection->c_remote_uuid,
-                      sizeof(oss->lqo_uuid));
-               ++id;
-               oss->lqo_id = id;
-       } else {
-               /* Assume we have to move this one */
-               list_del(&oss->lqo_oss_list);
-       }
-
-       oss->lqo_ost_count++;
-       ost_desc->ltd_qos.ltq_oss = oss;
-
-       CDEBUG(D_QOS, "add tgt %s to OSS %s (%d OSTs)\n",
-              obd_uuid2str(&ost_desc->ltd_uuid), obd_uuid2str(&oss->lqo_uuid),
-              oss->lqo_ost_count);
-
-       /* Add sorted by # of OSTs.  Find the first entry that we're
-          bigger than... */
-       list = &lod->lod_qos.lq_oss_list;
-       list_for_each_entry(temposs, list, lqo_oss_list) {
-               if (oss->lqo_ost_count > temposs->lqo_ost_count)
-                       break;
-       }
-       /* ...and add before it.  If we're the first or smallest, temposs
-          points to the list head, and we add to the end. */
-       list_add_tail(&oss->lqo_oss_list, &temposs->lqo_oss_list);
-
-       lod->lod_qos.lq_dirty = 1;
-       lod->lod_qos.lq_rr.lqr_dirty = 1;
-
-out:
-       up_write(&lod->lod_qos.lq_rw_sem);
-       RETURN(rc);
-}
-
-/**
- * Remove OST target from QoS table.
- *
- * Removes given OST target from QoS table and releases related OSS structure
- * if no OSTs remain on the OSS.
- *
- * \param[in] lod              LOD device
- * \param[in] ost_desc         OST description
- *
- * \retval 0                   on success
- * \retval -ENOENT             if no OSS was found
- */
-int qos_del_tgt(struct lod_device *lod, struct lod_tgt_desc *ost_desc)
-{
-       struct lod_qos_oss *oss;
-       int                 rc = 0;
-       ENTRY;
-
-       down_write(&lod->lod_qos.lq_rw_sem);
-       oss = ost_desc->ltd_qos.ltq_oss;
-       if (!oss)
-               GOTO(out, rc = -ENOENT);
-
-       oss->lqo_ost_count--;
-       if (oss->lqo_ost_count == 0) {
-               CDEBUG(D_QOS, "removing OSS %s\n",
-                      obd_uuid2str(&oss->lqo_uuid));
-               list_del(&oss->lqo_oss_list);
-               ost_desc->ltd_qos.ltq_oss = NULL;
-               OBD_FREE_PTR(oss);
-       }
-
-       lod->lod_qos.lq_dirty = 1;
-       lod->lod_qos.lq_rr.lqr_dirty = 1;
-out:
-       up_write(&lod->lod_qos.lq_rw_sem);
-       RETURN(rc);
-}
-
-/**
  * Check whether the target is available for new OST objects.
  *
  * Request statfs data from the given target and verify it's active and not
@@ -324,12 +208,13 @@ out:
  */
 static int lod_qos_calc_ppo(struct lod_device *lod)
 {
-       struct lod_qos_oss *oss;
-       __u64               ba_max, ba_min, temp;
-       __u32               num_active;
-       unsigned int        i;
-       int                 rc, prio_wide;
-       time64_t            now, age;
+       struct lu_svr_qos *oss;
+       __u64 ba_max, ba_min, temp;
+       __u32 num_active;
+       unsigned int i;
+       int rc, prio_wide;
+       time64_t now, age;
+
        ENTRY;
 
        if (!lod->lod_qos.lq_dirty)
@@ -340,9 +225,9 @@ static int lod_qos_calc_ppo(struct lod_device *lod)
                GOTO(out, rc = -EAGAIN);
 
        /* find bavail on each OSS */
-       list_for_each_entry(oss, &lod->lod_qos.lq_oss_list, lqo_oss_list)
-                           oss->lqo_bavail = 0;
-       lod->lod_qos.lq_active_oss_count = 0;
+       list_for_each_entry(oss, &lod->lod_qos.lq_svr_list, lsq_svr_list)
+               oss->lsq_bavail = 0;
+       lod->lod_qos.lq_active_svr_count = 0;
 
        /*
         * How badly user wants to select OSTs "widely" (not recently chosen
@@ -366,9 +251,9 @@ static int lod_qos_calc_ppo(struct lod_device *lod)
                ba_max = max(temp, ba_max);
 
                /* Count the number of usable OSS's */
-               if (OST_TGT(lod,i)->ltd_qos.ltq_oss->lqo_bavail == 0)
-                       lod->lod_qos.lq_active_oss_count++;
-               OST_TGT(lod,i)->ltd_qos.ltq_oss->lqo_bavail += temp;
+               if (OST_TGT(lod, i)->ltd_qos.ltq_svr->lsq_bavail == 0)
+                       lod->lod_qos.lq_active_svr_count++;
+               OST_TGT(lod, i)->ltd_qos.ltq_svr->lsq_bavail += temp;
 
                /* per-OST penalty is prio * TGT_bavail / (num_ost - 1) / 2 */
                temp >>= 1;
@@ -386,7 +271,7 @@ static int lod_qos_calc_ppo(struct lod_device *lod)
                                (age / lod->lod_desc.ld_qos_maxage);
        }
 
-       num_active = lod->lod_qos.lq_active_oss_count - 1;
+       num_active = lod->lod_qos.lq_active_svr_count - 1;
        if (num_active < 1) {
                /* If there's only 1 OSS, we can't penalize it, so instead
                   we have to double the OST penalty */
@@ -396,18 +281,18 @@ static int lod_qos_calc_ppo(struct lod_device *lod)
        }
 
        /* Per-OSS penalty is prio * oss_avail / oss_osts / (num_oss - 1) / 2 */
-       list_for_each_entry(oss, &lod->lod_qos.lq_oss_list, lqo_oss_list) {
-               temp = oss->lqo_bavail >> 1;
-               do_div(temp, oss->lqo_ost_count * num_active);
-               oss->lqo_penalty_per_obj = (temp * prio_wide) >> 8;
+       list_for_each_entry(oss, &lod->lod_qos.lq_svr_list, lsq_svr_list) {
+               temp = oss->lsq_bavail >> 1;
+               do_div(temp, oss->lsq_tgt_count * num_active);
+               oss->lsq_penalty_per_obj = (temp * prio_wide) >> 8;
 
-               age = (now - oss->lqo_used) >> 3;
+               age = (now - oss->lsq_used) >> 3;
                if (lod->lod_qos.lq_reset ||
                    age > 32 * lod->lod_desc.ld_qos_maxage)
-                       oss->lqo_penalty = 0;
+                       oss->lsq_penalty = 0;
                else if (age > lod->lod_desc.ld_qos_maxage)
                        /* Decay OSS penalty. */
-                       oss->lqo_penalty >>= age / lod->lod_desc.ld_qos_maxage;
+                       oss->lsq_penalty >>= age / lod->lod_desc.ld_qos_maxage;
        }
 
        lod->lod_qos.lq_dirty = 0;
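
For orientation, a minimal standalone sketch (not part of the patch; the function name, numbers and the prio_wide value are purely illustrative, and do_div() is the same kernel helper used above) of the per-server penalty arithmetic computed in the hunk above:

	static void lsq_penalty_example(void)
	{
		__u64 temp = (1ULL << 40) >> 1;	/* half of lsq_bavail: 1 TiB server */
		__u64 penalty_per_obj;

		/* divide by lsq_tgt_count * num_active, as in lod_qos_calc_ppo() */
		do_div(temp, 4 * 3);
		/* scale by an assumed prio_wide of 232; result is roughly 38 GiB */
		penalty_per_obj = (temp * 232) >> 8;
		(void)penalty_per_obj;
	}
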
@@ -447,12 +332,12 @@ static int lod_qos_calc_weight(struct lod_device *lod, int i)
        __u64 temp, temp2;
 
        temp = TGT_BAVAIL(i);
-       temp2 = OST_TGT(lod,i)->ltd_qos.ltq_penalty +
-               OST_TGT(lod,i)->ltd_qos.ltq_oss->lqo_penalty;
+       temp2 = OST_TGT(lod, i)->ltd_qos.ltq_penalty +
+               OST_TGT(lod, i)->ltd_qos.ltq_svr->lsq_penalty;
        if (temp < temp2)
-               OST_TGT(lod,i)->ltd_qos.ltq_weight = 0;
+               OST_TGT(lod, i)->ltd_qos.ltq_weight = 0;
        else
-               OST_TGT(lod,i)->ltd_qos.ltq_weight = temp - temp2;
+               OST_TGT(lod, i)->ltd_qos.ltq_weight = temp - temp2;
        return 0;
 }
 
@@ -474,7 +359,7 @@ static int lod_qos_used(struct lod_device *lod, struct ost_pool *osts,
                        __u32 index, __u64 *total_wt)
 {
        struct lod_tgt_desc *ost;
-       struct lod_qos_oss  *oss;
+       struct lu_svr_qos  *oss;
        unsigned int j;
        ENTRY;
 
@@ -484,28 +369,28 @@ static int lod_qos_used(struct lod_device *lod, struct ost_pool *osts,
        /* Don't allocate on this device anymore, until the next alloc_qos */
        ost->ltd_qos.ltq_usable = 0;
 
-       oss = ost->ltd_qos.ltq_oss;
+       oss = ost->ltd_qos.ltq_svr;
 
        /* Decay old penalty by half (we're adding max penalty, and don't
           want it to run away.) */
        ost->ltd_qos.ltq_penalty >>= 1;
-       oss->lqo_penalty >>= 1;
+       oss->lsq_penalty >>= 1;
 
        /* mark the OSS and OST as recently used */
-       ost->ltd_qos.ltq_used = oss->lqo_used = ktime_get_real_seconds();
+       ost->ltd_qos.ltq_used = oss->lsq_used = ktime_get_real_seconds();
 
        /* Set max penalties for this OST and OSS */
        ost->ltd_qos.ltq_penalty +=
                ost->ltd_qos.ltq_penalty_per_obj * lod->lod_ostnr;
-       oss->lqo_penalty += oss->lqo_penalty_per_obj *
-               lod->lod_qos.lq_active_oss_count;
+       oss->lsq_penalty += oss->lsq_penalty_per_obj *
+               lod->lod_qos.lq_active_svr_count;
 
        /* Decrease all OSS penalties */
-       list_for_each_entry(oss, &lod->lod_qos.lq_oss_list, lqo_oss_list) {
-               if (oss->lqo_penalty < oss->lqo_penalty_per_obj)
-                       oss->lqo_penalty = 0;
+       list_for_each_entry(oss, &lod->lod_qos.lq_svr_list, lsq_svr_list) {
+               if (oss->lsq_penalty < oss->lsq_penalty_per_obj)
+                       oss->lsq_penalty = 0;
                else
-                       oss->lqo_penalty -= oss->lqo_penalty_per_obj;
+                       oss->lsq_penalty -= oss->lsq_penalty_per_obj;
        }
 
        *total_wt = 0;
@@ -539,21 +424,20 @@ static int lod_qos_used(struct lod_device *lod, struct ost_pool *osts,
                          i, ost->ltd_qos.ltq_usable, TGT_BAVAIL(i) >> 10,
                          ost->ltd_qos.ltq_penalty_per_obj >> 10,
                          ost->ltd_qos.ltq_penalty >> 10,
-                         ost->ltd_qos.ltq_oss->lqo_penalty_per_obj >> 10,
-                         ost->ltd_qos.ltq_oss->lqo_penalty >> 10,
+                         ost->ltd_qos.ltq_svr->lsq_penalty_per_obj >> 10,
+                         ost->ltd_qos.ltq_svr->lsq_penalty >> 10,
                          ost->ltd_qos.ltq_weight >> 10);
        }
 
        RETURN(0);
 }
 
-void lod_qos_rr_init(struct lod_qos_rr *lqr)
+void lod_qos_rr_init(struct lu_qos_rr *lqr)
 {
        spin_lock_init(&lqr->lqr_alloc);
        lqr->lqr_dirty = 1;
 }
 
-
 #define LOV_QOS_EMPTY ((__u32)-1)
 
 /**
@@ -573,9 +457,9 @@ void lod_qos_rr_init(struct lod_qos_rr *lqr)
  * \retval -ENOMEM     fails to allocate the array
  */
 static int lod_qos_calc_rr(struct lod_device *lod, struct ost_pool *src_pool,
-                          struct lod_qos_rr *lqr)
+                          struct lu_qos_rr *lqr)
 {
-       struct lod_qos_oss  *oss;
+       struct lu_svr_qos  *oss;
        struct lod_tgt_desc *ost;
        unsigned placed, real_count;
        unsigned int i;
@@ -617,7 +501,7 @@ static int lod_qos_calc_rr(struct lod_device *lod, struct ost_pool *src_pool,
 
        /* Place all the OSTs from 1 OSS at the same time. */
        placed = 0;
-       list_for_each_entry(oss, &lod->lod_qos.lq_oss_list, lqo_oss_list) {
+       list_for_each_entry(oss, &lod->lod_qos.lq_svr_list, lsq_svr_list) {
                int j = 0;
 
                for (i = 0; i < lqr->lqr_pool.op_count; i++) {
@@ -629,11 +513,11 @@ static int lod_qos_calc_rr(struct lod_device *lod, struct ost_pool *src_pool,
 
                        ost = OST_TGT(lod,src_pool->op_array[i]);
                        LASSERT(ost && ost->ltd_ost);
-                       if (ost->ltd_qos.ltq_oss != oss)
+                       if (ost->ltd_qos.ltq_svr != oss)
                                continue;
 
                        /* Evenly space these OSTs across arrayspace */
-                       next = j * lqr->lqr_pool.op_count / oss->lqo_ost_count;
+                       next = j * lqr->lqr_pool.op_count / oss->lsq_tgt_count;
                        while (lqr->lqr_pool.op_array[next] != LOV_QOS_EMPTY)
                                next = (next + 1) % lqr->lqr_pool.op_count;
 
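
As a worked example of the even-spacing step above (illustrative values only, not part of the patch): with op_count = 6 and three servers of two targets each, the first server lands on slots 0 and 3, the second probes forward to 1 and 4, and the third to 2 and 5, giving an interleaved round-robin order. A minimal sketch of one server's placement, reusing the LOV_QOS_EMPTY sentinel defined earlier in this file (the function name is hypothetical):

	static void rr_spacing_example(void)
	{
		__u32 rr[6] = { [0 ... 5] = LOV_QOS_EMPTY };
		unsigned int op_count = 6, tgt_count = 2, j;

		for (j = 0; j < tgt_count; j++) {
			unsigned int next = j * op_count / tgt_count;

			/* linear-probe past slots already taken by other servers */
			while (rr[next] != LOV_QOS_EMPTY)
				next = (next + 1) % op_count;
			rr[next] = j;	/* slot for this server's j-th target */
		}
		(void)rr;
	}
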
@@ -894,7 +778,7 @@ static inline bool lod_should_avoid_ost(struct lod_object *lo,
 {
        struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev);
        struct lod_tgt_desc *ost = OST_TGT(lod, index);
-       struct lod_qos_oss *lqo = ost->ltd_qos.ltq_oss;
+       struct lu_svr_qos *lsq = ost->ltd_qos.ltq_svr;
        bool used = false;
        int i;
 
@@ -913,7 +797,7 @@ static inline bool lod_should_avoid_ost(struct lod_object *lo,
 
        /* check OSS use */
        for (i = 0; i < lag->lag_oaa_count; i++) {
-               if (lag->lag_oss_avoid_array[i] == lqo->lqo_id) {
+               if (lag->lag_oss_avoid_array[i] == lsq->lsq_id) {
                        used = true;
                        break;
                }
@@ -1057,7 +941,7 @@ static int lod_alloc_rr(const struct lu_env *env, struct lod_object *lo,
        struct obd_statfs *sfs = &lod_env_info(env)->lti_osfs;
        struct pool_desc  *pool = NULL;
        struct ost_pool   *osts;
-       struct lod_qos_rr *lqr;
+       struct lu_qos_rr *lqr;
        unsigned int    i, array_idx;
        __u32 ost_start_idx_temp;
        __u32 stripe_idx = 0;
@@ -2361,11 +2245,11 @@ void lod_collect_avoidance(struct lod_object *lo, struct lod_avoid_guide *lag,
                         */
                        for (j = 0; j < comp->llc_stripe_count; j++) {
                                struct lod_tgt_desc *ost;
-                               struct lod_qos_oss *lqo;
+                               struct lu_svr_qos *lsq;
                                int k;
 
                                ost = OST_TGT(lod, comp->llc_ost_indices[j]);
-                               lqo = ost->ltd_qos.ltq_oss;
+                               lsq = ost->ltd_qos.ltq_svr;
 
                                if (cfs_bitmap_check(bitmap, ost->ltd_index))
                                        continue;
@@ -2377,12 +2261,12 @@ void lod_collect_avoidance(struct lod_object *lo, struct lod_avoid_guide *lag,
 
                                for (k = 0; k < lag->lag_oaa_count; k++) {
                                        if (lag->lag_oss_avoid_array[k] ==
-                                           lqo->lqo_id)
+                                           lsq->lsq_id)
                                                break;
                                }
                                if (k == lag->lag_oaa_count) {
                                        lag->lag_oss_avoid_array[k] =
-                                                               lqo->lqo_id;
+                                                               lsq->lsq_id;
                                        lag->lag_oaa_count++;
                                }
                        }
index 66a67da..021d0e1 100644 (file)
@@ -12,6 +12,7 @@ obdclass-all-objs += cl_object.o cl_page.o cl_lock.o cl_io.o lu_ref.o
 obdclass-all-objs += linkea.o
 obdclass-all-objs += kernelcomm.o jobid.o
 obdclass-all-objs += integrity.o obd_cksum.o
+obdclass-all-objs += lu_qos.o
 
 @SERVER_TRUE@obdclass-all-objs += acl.o
 @SERVER_TRUE@obdclass-all-objs += idmap.o
diff --git a/lustre/obdclass/lu_qos.c b/lustre/obdclass/lu_qos.c
new file mode 100644 (file)
index 0000000..0e04bdd
--- /dev/null
@@ -0,0 +1,168 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * GPL HEADER END
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * lustre/obdclass/lu_qos.c
+ *
+ * Lustre QoS.
+ * These are the only exported functions; they provide generic
+ * infrastructure for object allocation QoS.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/module.h>
+#include <linux/list.h>
+#include <libcfs/libcfs.h>
+#include <libcfs/libcfs_hash.h> /* hash_long() */
+#include <libcfs/linux/linux-mem.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_disk.h>
+#include <lustre_fid.h>
+#include <lu_object.h>
+
+/**
+ * Add a new target to the Quality of Service (QoS) target table.
+ *
+ * Add a new MDT/OST target to the structure representing its MDS/OSS. Resort
+ * the list of known MDSs/OSSs by the number of MDTs/OSTs attached to each.
+ * The MDS/OSS list is protected internally; no external locking is required.
+ *
+ * \param[in] qos              lu_qos data
+ * \param[in] ltd              target description
+ *
+ * \retval 0                   on success
+ * \retval -ENOMEM             on error
+ */
+int lqos_add_tgt(struct lu_qos *qos, struct lu_tgt_desc *ltd)
+{
+       struct lu_svr_qos *svr = NULL;
+       struct lu_svr_qos *tempsvr;
+       struct obd_export *exp = ltd->ltd_exp;
+       int found = 0;
+       __u32 id = 0;
+       int rc = 0;
+
+       ENTRY;
+
+       down_write(&qos->lq_rw_sem);
+       /*
+        * a bit hacky approach to learn NID of corresponding connection
+        * but there is no official API to access information like this
+        * with OSD API.
+        */
+       list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) {
+               if (obd_uuid_equals(&svr->lsq_uuid,
+                                   &exp->exp_connection->c_remote_uuid)) {
+                       found++;
+                       break;
+               }
+               if (svr->lsq_id > id)
+                       id = svr->lsq_id;
+       }
+
+       if (!found) {
+               OBD_ALLOC_PTR(svr);
+               if (!svr)
+                       GOTO(out, rc = -ENOMEM);
+               memcpy(&svr->lsq_uuid, &exp->exp_connection->c_remote_uuid,
+                      sizeof(svr->lsq_uuid));
+               ++id;
+               svr->lsq_id = id;
+       } else {
+               /* Assume we have to move this one */
+               list_del(&svr->lsq_svr_list);
+       }
+
+       svr->lsq_tgt_count++;
+       ltd->ltd_qos.ltq_svr = svr;
+
+       CDEBUG(D_OTHER, "add tgt %s to server %s (%d targets)\n",
+              obd_uuid2str(&ltd->ltd_uuid), obd_uuid2str(&svr->lsq_uuid),
+              svr->lsq_tgt_count);
+
+       /*
+        * Add sorted by # of tgts.  Find the first entry that we're
+        * bigger than...
+        */
+       list_for_each_entry(tempsvr, &qos->lq_svr_list, lsq_svr_list) {
+               if (svr->lsq_tgt_count > tempsvr->lsq_tgt_count)
+                       break;
+       }
+       /*
+        * ...and add before it.  If we're the first or smallest, tempsvr
+        * points to the list head, and we add to the end.
+        */
+       list_add_tail(&svr->lsq_svr_list, &tempsvr->lsq_svr_list);
+
+       qos->lq_dirty = 1;
+       qos->lq_rr.lqr_dirty = 1;
+
+out:
+       up_write(&qos->lq_rw_sem);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(lqos_add_tgt);
+
+/**
+ * Remove MDT/OST target from QoS table.
+ *
+ * Removes given MDT/OST target from QoS table and releases related
+ * MDS/OSS structure if no targets remain on the MDS/OSS.
+ *
+ * \param[in] qos              lu_qos data
+ * \param[in] ltd              target description
+ *
+ * \retval 0                   on success
+ * \retval -ENOENT             if no server was found
+ */
+int lqos_del_tgt(struct lu_qos *qos, struct lu_tgt_desc *ltd)
+{
+       struct lu_svr_qos *svr;
+       int rc = 0;
+
+       ENTRY;
+
+       down_write(&qos->lq_rw_sem);
+       svr = ltd->ltd_qos.ltq_svr;
+       if (!svr)
+               GOTO(out, rc = -ENOENT);
+
+       svr->lsq_tgt_count--;
+       if (svr->lsq_tgt_count == 0) {
+               CDEBUG(D_OTHER, "removing server %s\n",
+                      obd_uuid2str(&svr->lsq_uuid));
+               list_del(&svr->lsq_svr_list);
+               ltd->ltd_qos.ltq_svr = NULL;
+               OBD_FREE_PTR(svr);
+       }
+
+       qos->lq_dirty = 1;
+       qos->lq_rr.lqr_dirty = 1;
+out:
+       up_write(&qos->lq_rw_sem);
+       RETURN(rc);
+}
+EXPORT_SYMBOL(lqos_del_tgt);
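
For context, a minimal sketch (not part of the patch; the wrapper names are hypothetical) of how a caller such as LOD is expected to use the two exported helpers, mirroring the lod_lov.c hunks earlier in this change:

	#include <lu_object.h>

	/* struct lod_device and its lod_qos member come from lod_internal.h */
	static int example_qos_add(struct lod_device *lod, struct lu_tgt_desc *tgt)
	{
		/* inserts tgt's server into lq_svr_list, kept sorted by target count */
		return lqos_add_tgt(&lod->lod_qos, tgt);
	}

	static int example_qos_del(struct lod_device *lod, struct lu_tgt_desc *tgt)
	{
		/* frees the lu_svr_qos entry once its last target is removed */
		return lqos_del_tgt(&lod->lod_qos, tgt);
	}
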
old mode 100755 (executable)
new mode 100644 (file)
index 1b72d2a..401803d
@@ -19952,7 +19952,7 @@ test_412() {
 }
 run_test 412 "mkdir on specific MDTs"
 
-test_413() {
+test_413a() {
        [ $MDSCOUNT -lt 2 ] &&
                skip "We need at least 2 MDTs for this test"
 
@@ -19986,7 +19986,129 @@ test_413() {
                        error "don't expect $max"
        done
 }
-run_test 413 "mkdir on less full MDTs"
+run_test 413a "mkdir on less full MDTs"
+
+test_413b() {
+       [ $MDSCOUNT -lt 2 ] &&
+               skip "We need at least 2 MDTs for this test"
+
+       [ $MDS1_VERSION -lt $(version_code 2.12.52) ] &&
+               skip "Need server version at least 2.12.52"
+
+       mkdir $DIR/$tdir || error "mkdir failed"
+       $LFS setdirstripe -D -i -1 -H space $DIR/$tdir ||
+               error "setdirstripe failed"
+
+       local qos_prio_free
+       local qos_threshold_rr
+       local count
+
+       qos_prio_free=$($LCTL get_param -n lmv.*.qos_prio_free | head -n1)
+       qos_prio_free=${qos_prio_free%%%}
+       qos_threshold_rr=$($LCTL get_param -n lmv.*.qos_threshold_rr | head -n1)
+       qos_threshold_rr=${qos_threshold_rr%%%}
+
+       stack_trap "$LCTL set_param lmv.*.qos_prio_free=$qos_prio_free" EXIT
+       stack_trap "$LCTL set_param lmv.*.qos_threshold_rr=$qos_threshold_rr" \
+               EXIT
+
+       echo "mkdir with roundrobin"
+
+       $LCTL set_param lmv.*.qos_threshold_rr=100
+       for i in $(seq $((100 * MDSCOUNT))); do
+               mkdir $DIR/$tdir/subdir$i || error "mkdir subdir$i failed"
+       done
+       for i in $(seq $MDSCOUNT); do
+               count=$($LFS getdirstripe -i $DIR/$tdir/* | grep ^$((i - 1))$ |
+                       wc -w)
+               echo "$count directories created on MDT$((i - 1))"
+               [ $count -eq 100 ] || error "subdirs are not evenly distributed"
+       done
+
+       rm -rf $DIR/$tdir/*
+
+       $LCTL set_param lmv.*.qos_threshold_rr=$qos_threshold_rr
+
+       local ffree
+       local max
+       local min
+       local max_index
+       local min_index
+
+       ffree=($(lctl get_param -n mdc.*[mM][dD][cC]-[^M]*.filesfree | uniq))
+       echo "MDT filesfree available: ${ffree[@]}"
+       max=${ffree[0]}
+       min=${ffree[0]}
+       max_index=0
+       min_index=0
+       for ((i = 0; i < ${#ffree[@]}; i++)); do
+               if [[ ${ffree[i]} -gt $max ]]; then
+                       max=${ffree[i]}
+                       max_index=$i
+               fi
+               if [[ ${ffree[i]} -lt $min ]]; then
+                       min=${ffree[i]}
+                       min_index=$i
+               fi
+       done
+       echo "Min free files: MDT$min_index: $min"
+       echo "Max free files: MDT$max_index: $max"
+
+       [ $min -eq 0 ] && skip "no free files in MDT$min_index"
+       [ $min -gt 10000000 ] && skip "too many free files in MDT$min_index"
+
+       # Check if we need to generate uneven MDTs
+       test_mkdir -i $min_index -c 1 -p $DIR/$tdir-MDT$min_index
+       local threshold=10
+       local diff=$((max - min))
+       local diff2=$((diff * 100 / min))
+
+       echo -n "Check for uneven MDTs: "
+       echo -n "diff=$diff files ($diff2%) must be > $threshold% ..."
+
+       if [ $diff2 -gt $threshold ]; then
+               echo "ok"
+               echo "Don't need to fill MDT$min_index"
+       else
+               # generate uneven MDTs by creating files until the gap exceeds the threshold
+               echo "no"
+               diff2=$((threshold - diff2))
+               diff=$((min * diff2 / 100))
+               # 50 sec per 10000 files in vm
+               [ $diff -gt 40000 ] && [ "$SLOW" = "no" ] &&
+                       skip "$diff files to create"
+               echo "Fill $diff2% diff in MDT$min_index with $diff files"
+               local i
+               local value="$(generate_string 1024)"
+               for i in $(seq $diff); do
+                       $OPENFILE -f O_CREAT:O_LOV_DELAY_CREATE \
+                               $DIR/$tdir-MDT$min_index/f$i > /dev/null ||
+                               error "create f$i failed"
+                       setfattr -n user.413b -v $value \
+                               $DIR/$tdir-MDT$min_index/f$i ||
+                               error "setfattr f$i failed"
+               done
+       fi
+
+       min=$((100 * MDSCOUNT))
+       max=0
+
+       echo "mkdir with balanced space usage"
+       $LCTL set_param lmv.*.qos_prio_free=100
+       for i in $(seq $((100 * MDSCOUNT))); do
+               mkdir $DIR/$tdir/subdir$i || error "mkdir subdir$i failed"
+       done
+       for i in $(seq $MDSCOUNT); do
+               count=$($LFS getdirstripe -i $DIR/$tdir/* | grep ^$((i - 1))$ |
+                       wc -w)
+               echo "$count directories created on MDT$((i - 1))"
+               [ $min -gt $count ] && min=$count
+               [ $max -lt $count ] && max=$count
+       done
+       [ $((max - min)) -gt $MDSCOUNT ] ||
+               error "subdirs shouldn't be evenly distributed"
+}
+run_test 413b "mkdir with balanced space usage"
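
To exercise only the new subtest on a setup with at least two MDTs, the usual sanity.sh filtering applies, e.g. running the script with ONLY=413b set in the environment (assuming the standard Lustre test-framework invocation).
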
 
 test_414() {
 #define OBD_FAIL_PTLRPC_BULK_ATTACH      0x521