Whamcloud - gitweb
LU-822 osd: multiple Object Index files
authorLiang Zhen <liang@whamcloud.com>
Thu, 8 Dec 2011 16:48:29 +0000 (00:48 +0800)
committerOleg Drokin <green@whamcloud.com>
Sun, 8 Jan 2012 17:51:26 +0000 (12:51 -0500)
A single OI container can be a performance bottleneck on the server
side, because many service threads may contend on the same OI htree.
Even though the OI htree supports parallel operations, there is
still a lot of spinlock contention and cacheline contention.
Also, parallel operations on the OI htree do not scale well when
there are hundreds or thousands of threads, because of limitations
of dynlock. Instead of fixing the scalability of dynlock, the long-term
solution is more straightforward: simply support multiple OI
containers and hash service threads to different OIs by lu_fid::f_seq.
This feature must still support the single OI created by 2.1 or
earlier versions. The user can specify the number of OIs with the
module parameter osd_oi_num when creating a new filesystem; this
parameter is ignored when the OSD loads an existing filesystem.

Signed-off-by: Liang Zhen <liang@whamcloud.com>
Change-Id: Iaa5ef9e43b80301150608802e40b4ef506467457
Reviewed-on: http://review.whamcloud.com/1822
Tested-by: Hudson
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Fan Yong <yong.fan@whamcloud.com>
Tested-by: Maloo <whamcloud.maloo@gmail.com>
lustre/include/lustre_disk.h
lustre/include/lustre_fid.h
lustre/mdt/mdt_internal.h
lustre/mdt/mdt_recovery.c
lustre/osd-ldiskfs/osd_handler.c
lustre/osd-ldiskfs/osd_internal.h
lustre/osd-ldiskfs/osd_oi.c
lustre/osd-ldiskfs/osd_oi.h

index 2e4edcb..2db5ac0 100644 (file)
@@ -280,6 +280,8 @@ struct lustre_mount_data {
  * will be confused by interpreting stripe_count | gen << 16 as the actual
  * stripe count */
 #define OBD_INCOMPAT_LMM_VER    0x00000100
+/** multiple OI files for MDT */
+#define OBD_INCOMPAT_MULTI_OI   0x00000200
 
 /* Data stored per server at the head of the last_rcvd file.  In le32 order.
    This should be common to filter_internal.h, lustre_mds.h */
index 4748466..55ecd77 100644 (file)
@@ -88,10 +88,15 @@ enum {
         LUSTRE_SEQ_SUPER_WIDTH = ((1ULL << 30ULL) * LUSTRE_SEQ_META_WIDTH)
 };
 
+enum {
+        /** log2 of the number of FIDs used for OI containers by default:
+         *  2^6 FIDs for OI containers */
+        OSD_OI_FID_OID_BITS     = 6,
+        /** log2 of the number of FIDs reserved for OI containers:
+         *  reserve enough FIDs in case we want more in the future */
+        OSD_OI_FID_OID_BITS_MAX = 10,
+};
+
 /** special OID for local objects */
 enum local_oid {
-        /** \see osd_oi_index_create */
-        OSD_OI_FID_16_OID       = 2UL,
         /** \see fld_mod_init */
         FLD_INDEX_OID           = 3UL,
         /** \see fid_mod_init */
@@ -107,6 +112,22 @@ enum local_oid {
         MDT_LAST_RECV_OID       = 11UL,
         /** \see osd_mod_init */
         OSD_REM_OBJ_DIR_OID     = 12UL,
+        OSD_FS_ROOT_OID         = 13UL,
+        ACCT_USER_OID           = 15UL,
+        ACCT_GROUP_OID          = 16UL,
+        OFD_LAST_RECV_OID       = 19UL,
+        OFD_GROUP0_LAST_OID     = 20UL,
+        OFD_GROUP4K_LAST_OID    = 20UL+4096,
+        OFD_LAST_GROUP_OID      = 4117UL,
+        LLOG_CATALOGS_OID       = 4118UL,
+        MGS_CONFIGS_OID         = 4119UL,
+        OFD_HEALTH_CHECK_OID    = 4120UL,
+
+        /** first OID for first OI fid */
+        OSD_OI_FID_OID_FIRST    = 5000UL,
+        /** reserve enough in case we want to have more in the future */
+        OSD_OI_FID_OID_MAX      = OSD_OI_FID_OID_FIRST +
+                                  (1UL << OSD_OI_FID_OID_BITS_MAX),
 };
 
 static inline void lu_local_obj_fid(struct lu_fid *fid, __u32 oid)
index 8a10a6e..7782a11 100644 (file)
@@ -179,7 +179,7 @@ struct mdt_device {
 #define MDT_ROCOMPAT_SUPP       (OBD_ROCOMPAT_LOVOBJID)
 #define MDT_INCOMPAT_SUPP       (OBD_INCOMPAT_MDT | OBD_INCOMPAT_COMMON_LR | \
                                  OBD_INCOMPAT_FID | OBD_INCOMPAT_IAM_DIR | \
-                                 OBD_INCOMPAT_LMM_VER)
+                                 OBD_INCOMPAT_LMM_VER | OBD_INCOMPAT_MULTI_OI)
 #define MDT_COS_DEFAULT         (0)
 
 struct mdt_object {
index abc604c..2ea6ed8 100644 (file)
@@ -368,7 +368,8 @@ static int mdt_server_data_init(const struct lu_env *env,
                 lsd->lsd_feature_compat = OBD_COMPAT_MDT;
                 lsd->lsd_feature_rocompat = OBD_ROCOMPAT_LOVOBJID;
                 lsd->lsd_feature_incompat = OBD_INCOMPAT_MDT |
-                                            OBD_INCOMPAT_COMMON_LR;
+                                            OBD_INCOMPAT_COMMON_LR |
+                                            OBD_INCOMPAT_MULTI_OI;
         } else {
                 LCONSOLE_WARN("%s: used disk, loading\n", obd->obd_name);
                 rc = mdt_last_rcvd_header_read(env, mdt);
index a8111eb..72bfd42 100644 (file)
@@ -432,7 +432,6 @@ static int osd_fid_lookup(const struct lu_env *env,
         struct lu_device       *ldev = obj->oo_dt.do_lu.lo_dev;
         struct osd_device      *dev;
         struct osd_inode_id    *id;
-        struct osd_oi          *oi;
         struct inode           *inode;
         int                     result;
 
@@ -451,12 +450,11 @@ static int osd_fid_lookup(const struct lu_env *env,
         info = osd_oti_get(env);
         dev  = osd_dev(ldev);
         id   = &info->oti_id;
-        oi   = &dev->od_oi;
 
         if (OBD_FAIL_CHECK(OBD_FAIL_OST_ENOENT))
                 RETURN(-ENOENT);
 
-        result = osd_oi_lookup(info, oi, fid, id);
+        result = osd_oi_lookup(info, osd_fid2oi(dev, fid), fid, id);
         if (result != 0) {
                 if (result == -ENOENT)
                         result = 0;
@@ -1888,7 +1886,7 @@ static int __osd_oi_insert(const struct lu_env *env, struct osd_object *obj,
         id->oii_ino = obj->oo_inode->i_ino;
         id->oii_gen = obj->oo_inode->i_generation;
 
-        return osd_oi_insert(info, &osd->od_oi, fid, id, th,
+        return osd_oi_insert(info, osd_fid2oi(osd, fid), fid, id, th,
                              uc->mu_cap & CFS_CAP_SYS_RESOURCE_MASK);
 }
 
@@ -1994,7 +1992,8 @@ static int osd_object_destroy(const struct lu_env *env,
 
         OSD_EXEC_OP(th, destroy);
 
-        result = osd_oi_delete(osd_oti_get(env), &osd->od_oi, fid, th);
+        result = osd_oi_delete(osd_oti_get(env),
+                               osd_fid2oi(osd, fid), fid, th);
 
         /* XXX: add to ext3 orphan list */
         /* rc = ext3_orphan_add(handle_t *handle, struct inode *inode) */
@@ -4419,7 +4418,8 @@ static int osd_shutdown(const struct lu_env *env, struct osd_device *o)
                 lu_object_put(env, &o->od_obj_area->do_lu);
                 o->od_obj_area = NULL;
         }
-        osd_oi_fini(info, &o->od_oi);
+        if (o->od_oi_table != NULL)
+                osd_oi_fini(info, &o->od_oi_table, o->od_oi_count);
 
         RETURN(0);
 }
@@ -4571,11 +4571,14 @@ static int osd_prepare(const struct lu_env *env,
 
         ENTRY;
         /* 1. initialize oi before any file create or file open */
-        result = osd_oi_init(oti, &osd->od_oi,
+        result = osd_oi_init(oti, &osd->od_oi_table,
                              &osd->od_dt_dev, lu2md_dev(pdev));
-        if (result != 0)
+        if (result < 0)
                 RETURN(result);
 
+        LASSERT(result > 0);
+        osd->od_oi_count = result;
+
         lmi = osd->od_mount;
         lsi = s2lsi(lmi->lmi_sb);
         ldd = lsi->lsi_ldd;
index 37fae5a..a9450a4 100644 (file)
@@ -155,14 +155,15 @@ struct osd_device {
         struct dt_device          od_dt_dev;
         /* information about underlying file system */
         struct lustre_mount_info *od_mount;
-        /* object index */
-        struct osd_oi             od_oi;
         /*
          * XXX temporary stuff for object index: directory where every object
          * is named by its fid.
          */
         struct dt_object         *od_obj_area;
-
+        /* object index */
+        struct osd_oi            *od_oi_table;
+        /* total number of OI containers */
+        int                       od_oi_count;
         /*
          * Fid Capability
          */
@@ -394,5 +395,15 @@ static inline int osd_fid_is_igif(const struct lu_fid *fid)
         return fid_is_igif(fid) || osd_fid_is_root(fid);
 }
 
+/**
+ * Select the OI container that indexes \a fid.
+ *
+ * Normal FIDs are spread over the OI table by their sequence number
+ * (f_seq modulo the number of OI containers).
+ *
+ * \retval      NULL    \a fid is not a normal FID
+ * \retval      OI      container of \a osd responsible for \a fid
+ */
+static inline struct osd_oi *
+osd_fid2oi(struct osd_device *osd, const struct lu_fid *fid)
+{
+        if (fid_is_norm(fid)) {
+                LASSERT(osd->od_oi_table != NULL && osd->od_oi_count >= 1);
+
+                return osd->od_oi_table + fid->f_seq % osd->od_oi_count;
+        }
+
+        return NULL;
+}
+
 #endif /* __KERNEL__ */
 #endif /* _OSD_INTERNAL_H */
index c63b7f5..bcad98e 100644 (file)
 #include "osd_igif.h"
 #include "dt_object.h"
 
-struct oi_descr {
-        int   fid_size;
-        char *name;
-        __u32 oid;
-};
+/* default number of OI containers created on a new filesystem */
+#define OSD_OI_FID_NR         (1UL << OSD_OI_FID_OID_BITS)
+/* upper bound of the number of OI containers, also the OI table size */
+#define OSD_OI_FID_NR_MAX     (1UL << OSD_OI_FID_OID_BITS_MAX)
+
+/* Number of OI containers to create. osd_oi_mod_init() resets it to
+ * OSD_OI_FID_NR if it is zero or larger than OSD_OI_FID_NR_MAX.
+ * NOTE(review): declared as unsigned int but registered as an "int"
+ * module parameter -- confirm the type mismatch is intended. */
+static unsigned int osd_oi_num = OSD_OI_FID_NR;
+CFS_MODULE_PARM(osd_oi_num, "i", int, 0444,
+                "Number of Object Index containers to be created, "
+                "it's only valid for new filesystem.");
 
 /** to serialize concurrent OI index initialization */
 static cfs_mutex_t oi_init_lock;
@@ -90,39 +92,152 @@ static struct dt_index_features oi_feat = {
         .dif_ptrsize     = 4
 };
 
-static const struct oi_descr oi_descr[OSD_OI_FID_NR] = {
-        [OSD_OI_FID_16] = {
-                .fid_size = sizeof(struct lu_fid),
-                .name     = "oi.16",
-                .oid      = OSD_OI_FID_16_OID
+#define OSD_OI_NAME_BASE        "oi.16"
+
+/**
+ * Open an OI(Object Index) container.
+ *
+ * The object is looked up by name and then checked against the OI index
+ * features (oi_feat); an object that fails the check is released again.
+ *
+ * \param       info    Per-thread OSD info; provides the lu_env and the
+ *                      FID buffer handed to dt_store_open()
+ * \param       dev     Device to open the container on
+ * \param       name    Name of OI container
+ * \param       objp    Pointer of returned OI, only written on success
+ *
+ * \retval      0       success
+ * \retval      -ve     failure
+ */
+static int
+osd_oi_open(struct osd_thread_info *info,
+            struct dt_device *dev, char *name, struct dt_object **objp)
+{
+        const struct lu_env *env = info->oti_env;
+        struct dt_object    *obj;
+        int                  rc;
+
+        obj = dt_store_open(env, dev, "", name, &info->oti_fid);
+        if (IS_ERR(obj))
+                return PTR_ERR(obj);
+
+        /* OI keys are whole FIDs: min and max key size are both
+         * sizeof(info->oti_fid) */
+        oi_feat.dif_keysize_min = sizeof(info->oti_fid);
+        oi_feat.dif_keysize_max = sizeof(info->oti_fid);
+
+        rc = obj->do_ops->do_index_try(env, obj, &oi_feat);
+        if (rc != 0) {
+                lu_object_put(info->oti_env, &obj->do_lu);
+                CERROR("%s: wrong index %s: rc = %d\n",
+                       dev->dd_lu_dev.ld_obd->obd_name, name, rc);
+                return rc;
         }
-};
 
-static int osd_oi_index_create(struct osd_thread_info *info,
+        *objp = obj;
+        return 0;
+}
+
+
+/**
+ * Release the first \a oi_count OI containers of \a oi_table.
+ *
+ * Each of these slots must hold an opened container; the slot is reset
+ * to NULL after its object has been put.
+ */
+static void
+osd_oi_table_put(struct osd_thread_info *info,
+                 struct osd_oi *oi_table, unsigned oi_count)
+{
+        int     i = 0;
+
+        while (i < oi_count) {
+                struct osd_oi *slot = &oi_table[i];
+
+                LASSERT(oi_table[i].oi_dir != NULL);
+
+                lu_object_put(info->oti_env, &slot->oi_dir->do_lu);
+                slot->oi_dir = NULL;
+                i++;
+        }
+}
+
+/**
+ * Open OI(Object Index) table.
+ * If \a oi_count is zero, which means caller doesn't know how many OIs there
+ * will be, this function can either return 0 for a new filesystem, or the
+ * number of OIs on an existing filesystem.
+ *
+ * If \a oi_count is non-zero, which means caller does know number of OIs on
+ * filesystem, this function should return exactly the same number on
+ * success, or error code in failure.
+ *
+ * Containers are named OSD_OI_NAME_BASE ".<index>"; slot <index> of
+ * \a oi_table receives the opened object.
+ *
+ * \param     info      Per-thread OSD info, passed on to osd_oi_open()
+ * \param     dev       Device to open the OI containers on
+ * \param     oi_table  Table to fill; needs room for every probed slot
+ *                      (OSD_OI_FID_NR_MAX slots if \a oi_count is zero)
+ * \param     oi_count  Number of expected OI containers
+ * \param     try_all   Try to open all OIs even if some fail; only the
+ *                      number of opened OIs is returned then, not which
+ *                      slots are valid
+ *
+ * \retval    +ve       number of opened OI containers
+ * \retval      0       no OI containers found
+ * \retval    -ve       failure, OIs opened so far have been released
+ */
+static int
+osd_oi_table_open(struct osd_thread_info *info, struct dt_device *dev,
+                  struct osd_oi *oi_table, unsigned oi_count, int try_all)
+{
+        unsigned limit;
+        unsigned i;
+        int      count = 0;
+        int      rc = 0;
+
+        /* NB: oi_count != 0 means that we have already created/known all OIs
+         * and have known exact number of OIs. */
+        LASSERT(oi_count <= OSD_OI_FID_NR_MAX);
+
+        /* probe every possible slot if the number of OIs is unknown */
+        limit = oi_count != 0 ? oi_count : OSD_OI_FID_NR_MAX;
+
+        for (i = 0; i < limit; i++) {
+                char name[12];
+
+                /* bounded write: name[] can't be overrun even if the
+                 * maximum OI index gets more digits in the future */
+                snprintf(name, sizeof(name), "%s.%u", OSD_OI_NAME_BASE, i);
+                rc = osd_oi_open(info, dev, name, &oi_table[i].oi_dir);
+                if (rc == 0) {
+                        count++;
+                        continue;
+                }
+
+                if (try_all)
+                        continue;
+
+                /* first missing file ends the probe of unknown OI count */
+                if (rc == -ENOENT && oi_count == 0)
+                        return count;
+
+                CERROR("%s: can't open %s: rc = %d\n",
+                       dev->dd_lu_dev.ld_obd->obd_name, name, rc);
+
+                if (oi_count > 0) {
+                        CERROR("%s: expect to open total %u OI files.\n",
+                               dev->dd_lu_dev.ld_obd->obd_name, oi_count);
+                }
+
+                break;
+        }
+
+        if (try_all)
+                return count;
+
+        if (rc < 0) {
+                /* slots [0, count) are the ones opened before the failure */
+                osd_oi_table_put(info, oi_table, count);
+                return rc;
+        }
+
+        return count;
+}
+
+static int osd_oi_table_create(struct osd_thread_info *info,
                                struct dt_device *dev,
-                               struct md_device *mdev)
+                               struct md_device *mdev, int oi_count)
 {
         const struct lu_env *env;
-        struct lu_fid *oi_fid = &info->oti_fid;
         struct md_object *mdo;
         int i;
-        int rc;
 
         env = info->oti_env;
+        for (i = 0; i < oi_count; ++i) {
+                char name[12];
 
-        for (i = rc = 0; i < OSD_OI_FID_NR && rc == 0; ++i) {
-                char *name;
-                name = oi_descr[i].name;
-                lu_local_obj_fid(oi_fid, oi_descr[i].oid);
-                oi_feat.dif_keysize_min = oi_descr[i].fid_size,
-                oi_feat.dif_keysize_max = oi_descr[i].fid_size,
+                sprintf(name, "%s.%d", OSD_OI_NAME_BASE, i);
 
-                mdo = llo_store_create_index(env, mdev, dev,
-                                             "", name,
-                                             oi_fid, &oi_feat);
+                lu_local_obj_fid(&info->oti_fid, OSD_OI_FID_OID_FIRST + i);
+                oi_feat.dif_keysize_min = sizeof(info->oti_fid);
+                oi_feat.dif_keysize_max = sizeof(info->oti_fid);
 
-                if (IS_ERR(mdo))
+                mdo = llo_store_create_index(env, mdev, dev, "", name,
+                                             &info->oti_fid, &oi_feat);
+                if (IS_ERR(mdo)) {
+                        CERROR("Failed to create OI[%d] on %s: %d\n",
+                               i, dev->dd_lu_dev.ld_obd->obd_name,
+                               (int)PTR_ERR(mdo));
                         RETURN(PTR_ERR(mdo));
+                }
 
                 lu_object_put(env, &mdo->mo_lu);
         }
@@ -130,68 +245,62 @@ static int osd_oi_index_create(struct osd_thread_info *info,
 }
 
+/**
+ * Set up the OI table of a device.
+ *
+ * Tries, in this order: to open the multiple OI files, to open the single
+ * OI file named OSD_OI_NAME_BASE of an old filesystem, and finally to
+ * create and open osd_oi_num new OI files. The whole sequence runs under
+ * oi_init_lock.
+ *
+ * \param     info      Per-thread OSD info
+ * \param     oi_table  Receives the allocated OI table on success
+ * \param     dev       Device to open the OI containers on
+ * \param     mdev      Device handed to osd_oi_table_create()
+ *
+ * \retval    +ve       number of OI containers in \a oi_table
+ * \retval    -ve       failure, the table is freed and \a oi_table is
+ *                      left untouched
+ */
 int osd_oi_init(struct osd_thread_info *info,
-                struct osd_oi *oi,
+                struct osd_oi **oi_table,
                 struct dt_device *dev,
                 struct md_device *mdev)
 {
-        const struct lu_env *env;
+        struct osd_oi *oi;
         int rc;
-        int i;
 
-        env = info->oti_env;
+        /* always sized for the maximum, the real OI count isn't known yet */
+        OBD_ALLOC(oi, sizeof(*oi) * OSD_OI_FID_NR_MAX);
+        if (oi == NULL)
+                return -ENOMEM;
+
         cfs_mutex_lock(&oi_init_lock);
-        memset(oi, 0, sizeof *oi);
-retry:
-        for (i = rc = 0; i < OSD_OI_FID_NR && rc == 0; ++i) {
-                const char       *name;
-                struct dt_object *obj;
-
-                name = oi_descr[i].name;
-                oi_feat.dif_keysize_min = oi_descr[i].fid_size,
-                oi_feat.dif_keysize_max = oi_descr[i].fid_size,
-
-                obj = dt_store_open(env, dev, "", name, &info->oti_fid);
-                if (!IS_ERR(obj)) {
-                        rc = obj->do_ops->do_index_try(env, obj, &oi_feat);
-                        if (rc == 0) {
-                                LASSERT(obj->do_index_ops != NULL);
-                                oi->oi_dir = obj;
-                        } else {
-                                CERROR("Wrong index \"%s\": %d\n", name, rc);
-                                lu_object_put(env, &obj->do_lu);
-                        }
-                } else {
-                        rc = PTR_ERR(obj);
-                        if (rc == -ENOENT) {
-                                rc = osd_oi_index_create(info, dev, mdev);
-                                if (!rc)
-                                        goto retry;
-                        }
-                        CERROR("Cannot open \"%s\": %d\n", name, rc);
-                }
+
+        /* oi_count == 0: probe for existing multiple OI files;
+         * +ve means they were found, -ve is an error, 0 means none */
+        rc = osd_oi_table_open(info, dev, oi, 0, 0);
+        if (rc != 0)
+                goto out;
+
+        rc = osd_oi_open(info, dev, OSD_OI_NAME_BASE, &oi[0].oi_dir);
+        if (rc == 0) { /* found single OI from old filesystem */
+                rc = 1;
+                goto out;
+        }
+
+        if (rc != -ENOENT) {
+                CERROR("%s: can't open %s: rc = %d\n",
+                       dev->dd_lu_dev.ld_obd->obd_name, OSD_OI_NAME_BASE, rc);
+                goto out;
         }
+
+        /* create OI objects */
+        rc = osd_oi_table_create(info, dev, mdev, osd_oi_num);
         if (rc != 0)
-                osd_oi_fini(info, oi);
+                goto out;
+
+        rc = osd_oi_table_open(info, dev, oi, osd_oi_num, 0);
+        LASSERT(rc == osd_oi_num || rc < 0);
+
+ out:
+        if (rc < 0)
+                OBD_FREE(oi, sizeof(*oi) * OSD_OI_FID_NR_MAX);
+        else
+                *oi_table = oi;
 
         cfs_mutex_unlock(&oi_init_lock);
         return rc;
 }
 
+/**
+ * Release an OI table.
+ *
+ * Puts the first \a oi_count OI containers of the table, frees the table
+ * and resets \a oi_table to NULL.
+ *
+ * \param     info      Per-thread OSD info
+ * \param     oi_table  Address of the table pointer to be released
+ * \param     oi_count  Number of opened OI containers in the table
+ */
-void osd_oi_fini(struct osd_thread_info *info, struct osd_oi *oi)
+void osd_oi_fini(struct osd_thread_info *info,
+                 struct osd_oi **oi_table, unsigned oi_count)
 {
-        if (oi->oi_dir != NULL) {
-                lu_object_put(info->oti_env, &oi->oi_dir->do_lu);
-                oi->oi_dir = NULL;
-        }
-}
+        struct osd_oi *oi = *oi_table;
 
-static inline int fid_is_oi_fid(const struct lu_fid *fid)
-{
-        /* We need to filter-out oi obj's fid. As we can not store it, while
-         * oi-index create operation.
-         */
-        return (unlikely(fid_seq(fid) == FID_SEQ_LOCAL_FILE &&
-                fid_oid(fid) == OSD_OI_FID_16_OID));
+        osd_oi_table_put(info, oi, oi_count);
+
+        /* same size as the allocation in osd_oi_init(), which is always
+         * OSD_OI_FID_NR_MAX slots regardless of oi_count */
+        OBD_FREE(oi, sizeof(*oi) * OSD_OI_FID_NR_MAX);
+        *oi_table = NULL;
 }
 
 int osd_oi_lookup(struct osd_thread_info *info, struct osd_oi *oi,
@@ -207,7 +316,7 @@ int osd_oi_lookup(struct osd_thread_info *info, struct osd_oi *oi,
                 struct dt_object    *idx;
                 const struct dt_key *key;
 
-                if (fid_is_oi_fid(fid))
+                if (!fid_is_norm(fid))
                         return -ENOENT;
 
                 idx = oi->oi_dir;
@@ -235,10 +344,7 @@ int osd_oi_insert(struct osd_thread_info *info, struct osd_oi *oi,
         struct osd_inode_id *id;
         const struct dt_key *key;
 
-        if (osd_fid_is_igif(fid))
-                return 0;
-
-        if (fid_is_oi_fid(fid))
+        if (!fid_is_norm(fid))
                 return 0;
 
         idx = oi->oi_dir;
@@ -262,7 +368,7 @@ int osd_oi_delete(struct osd_thread_info *info,
         struct dt_object    *idx;
         const struct dt_key *key;
 
-        if (osd_fid_is_igif(fid))
+        if (!fid_is_norm(fid))
                 return 0;
 
         idx = oi->oi_dir;
@@ -274,6 +380,9 @@ int osd_oi_delete(struct osd_thread_info *info,
 
 int osd_oi_mod_init()
 {
+        /* an out-of-range osd_oi_num module parameter silently falls back
+         * to the default number of OI containers */
+        if (osd_oi_num == 0 || osd_oi_num > OSD_OI_FID_NR_MAX)
+                osd_oi_num = OSD_OI_FID_NR;
+
         cfs_mutex_init(&oi_init_lock);
         return 0;
 }
index 3f0b09f..d8f5377 100644 (file)
@@ -53,6 +53,7 @@
 
 /* struct rw_semaphore */
 #include <linux/rwsem.h>
+#include <lustre_fid.h>
 #include <lu_object.h>
 #include <md_object.h>
 
@@ -63,11 +64,6 @@ struct thandle;
 
 struct dt_device;
 
-enum {
-        OSD_OI_FID_16,
-        OSD_OI_FID_NR
-};
-
 /*
  * Object Index (oi) instance.
  */
@@ -92,10 +88,11 @@ struct osd_inode_id {
 
 int osd_oi_mod_init(void);
 int osd_oi_init(struct osd_thread_info *info,
-                struct osd_oi *oi,
+                struct osd_oi **oi_table,
                 struct dt_device *dev,
                 struct md_device *mdev);
-void osd_oi_fini(struct osd_thread_info *info, struct osd_oi *oi);
+void osd_oi_fini(struct osd_thread_info *info,
+                 struct osd_oi **oi_table, unsigned oi_count);
 
 int  osd_oi_lookup(struct osd_thread_info *info, struct osd_oi *oi,
                    const struct lu_fid *fid, struct osd_inode_id *id);