Whamcloud - gitweb
LU-5164 osd: Limit lu_object cache 37/10237/3
authorBrian Behlendorf <behlendorf1@llnl.gov>
Thu, 24 Apr 2014 20:47:47 +0000 (13:47 -0700)
committerOleg Drokin <oleg.drokin@intel.com>
Wed, 18 Jun 2014 03:18:29 +0000 (03:18 +0000)
For OSDs like ZFS to perform optimally it's import that they be allowed
to manage their own cache.  This maximizes the likelyhood that the ARC
will prefetch and cache the right buffers.  In the existing ZFS OSD
code a cached LU object pins buffers in the ARC preventing them from
being dropped.  As the LU cache grows it can consume the entire ARC
preventing buffers for other objects, such as the OIs, from being
cached and severely impacting the performance for FID lookups.

By default this patch will only limit the LU cache for ZFS OSDs.

NOTES:

* Setting LU_CACHE_NR_ZFS_LIMIT to 0 results in an LBUG on the MDS.
  This may be because an object is being used without a reference.
  Setting a minimum value of 256 was arbitrary, ideally we would
  set this value to 0.

* In order to be able to quickly determine the number of objects in
  the hash table the CFS_HASH_COUNTER flag is added.  This adds an
  atomic_inc/dec to the hash insert/remove paths but is not expected
  to have any measurable impact of performance.

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Change-Id: Ia64838d50395f1d22e558631adbfa39d44e91606
Patch-Set: 3
Reviewed-on: http://review.whamcloud.com/10237
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Alex Zhuravlev <alexey.zhuravlev@intel.com>
Reviewed-by: Isaac Huang <he.huang@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/obdclass/lu_object.c

index 3fac810..fe43331 100644 (file)
 
 extern spinlock_t obd_types_lock;
 
 
 extern spinlock_t obd_types_lock;
 
+enum {
+       LU_CACHE_PERCENT_MAX     = 50,
+       LU_CACHE_PERCENT_DEFAULT = 20
+};
+
+#define        LU_CACHE_NR_MAX_ADJUST          128
+#define        LU_CACHE_NR_UNLIMITED           -1
+#define        LU_CACHE_NR_DEFAULT             LU_CACHE_NR_UNLIMITED
+#define        LU_CACHE_NR_LDISKFS_LIMIT       LU_CACHE_NR_UNLIMITED
+#define        LU_CACHE_NR_ZFS_LIMIT           256
+
+#define LU_SITE_BITS_MIN    12
+#define LU_SITE_BITS_MAX    24
+/**
+ * total 256 buckets, we don't want too many buckets because:
+ * - consume too much memory
+ * - avoid unbalanced LRU list
+ */
+#define LU_SITE_BKT_BITS    8
+
+
+static unsigned int lu_cache_percent = LU_CACHE_PERCENT_DEFAULT;
+CFS_MODULE_PARM(lu_cache_percent, "i", int, 0644,
+               "Percentage of memory to be used as lu_object cache");
+
+static long lu_cache_nr = LU_CACHE_NR_DEFAULT;
+CFS_MODULE_PARM(lu_cache_nr, "l", long, 0644,
+               "Maximum number of objects in lu_object cache");
+
 static void lu_object_free(const struct lu_env *env, struct lu_object *o);
 
 /**
 static void lu_object_free(const struct lu_env *env, struct lu_object *o);
 
 /**
@@ -621,6 +650,30 @@ struct lu_object *lu_object_find(const struct lu_env *env,
 }
 EXPORT_SYMBOL(lu_object_find);
 
 }
 EXPORT_SYMBOL(lu_object_find);
 
+/*
+ * Limit the lu_object cache to a maximum of lu_cache_nr objects.  Because
+ * the calculation for the number of objects to reclaim is not covered by
+ * a lock the maximum number of objects is capped by LU_CACHE_MAX_ADJUST.
+ * This ensures that many concurrent threads will not accidentally purge
+ * the entire cache.
+ */
+static void lu_object_limit(const struct lu_env *env,
+                           struct lu_device *dev)
+{
+       __u64 size, nr;
+
+       if (lu_cache_nr == LU_CACHE_NR_UNLIMITED)
+               return;
+
+       size = cfs_hash_size_get(dev->ld_site->ls_obj_hash);
+       nr = (__u64)lu_cache_nr;
+       if (size > nr)
+               lu_site_purge(env, dev->ld_site,
+                             MIN(size - nr, LU_CACHE_NR_MAX_ADJUST));
+
+       return;
+}
+
 static struct lu_object *lu_object_new(const struct lu_env *env,
                                        struct lu_device *dev,
                                        const struct lu_fid *f,
 static struct lu_object *lu_object_new(const struct lu_env *env,
                                        struct lu_device *dev,
                                        const struct lu_fid *f,
@@ -641,6 +694,9 @@ static struct lu_object *lu_object_new(const struct lu_env *env,
         cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash);
         bkt->lsb_busy++;
         cfs_hash_bd_unlock(hs, &bd, 1);
         cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash);
         bkt->lsb_busy++;
         cfs_hash_bd_unlock(hs, &bd, 1);
+
+       lu_object_limit(env, dev);
+
         return o;
 }
 
         return o;
 }
 
@@ -711,6 +767,9 @@ static struct lu_object *lu_object_find_try(const struct lu_env *env,
                 cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash);
                 bkt->lsb_busy++;
                 cfs_hash_bd_unlock(hs, &bd, 1);
                 cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash);
                 bkt->lsb_busy++;
                 cfs_hash_bd_unlock(hs, &bd, 1);
+
+               lu_object_limit(env, dev);
+
                 return o;
         }
 
                 return o;
         }
 
@@ -883,23 +942,26 @@ void lu_site_print(const struct lu_env *env, struct lu_site *s, void *cookie,
 }
 EXPORT_SYMBOL(lu_site_print);
 
 }
 EXPORT_SYMBOL(lu_site_print);
 
-enum {
-        LU_CACHE_PERCENT_MAX     = 50,
-        LU_CACHE_PERCENT_DEFAULT = 20
-};
-
-static unsigned int lu_cache_percent = LU_CACHE_PERCENT_DEFAULT;
-CFS_MODULE_PARM(lu_cache_percent, "i", int, 0644,
-                "Percentage of memory to be used as lu_object cache");
-
 /**
  * Return desired hash table order.
  */
 /**
  * Return desired hash table order.
  */
-static int lu_htable_order(void)
+static int lu_htable_order(struct lu_device *top)
 {
         unsigned long cache_size;
         int bits;
 
 {
         unsigned long cache_size;
         int bits;
 
+       /*
+        * For ZFS based OSDs the cache should be disabled by default.  This
+        * allows the ZFS ARC maximum flexibility in determining what buffers
+        * to cache.  If Lustre has objects or buffer which it wants to ensure
+        * always stay cached it must maintain a hold on them.
+        */
+       if (strcmp(top->ld_type->ldt_name, LUSTRE_OSD_ZFS_NAME) == 0) {
+               lu_cache_percent = 1;
+               lu_cache_nr = LU_CACHE_NR_ZFS_LIMIT;
+               return LU_SITE_BITS_MIN;
+       }
+
         /*
          * Calculate hash table size, assuming that we want reasonable
          * performance when 20% of total memory is occupied by cache of
         /*
          * Calculate hash table size, assuming that we want reasonable
          * performance when 20% of total memory is occupied by cache of
@@ -1020,47 +1082,39 @@ void lu_dev_del_linkage(struct lu_site *s, struct lu_device *d)
 EXPORT_SYMBOL(lu_dev_del_linkage);
 
 /**
 EXPORT_SYMBOL(lu_dev_del_linkage);
 
 /**
- * Initialize site \a s, with \a d as the top level device.
- */
-#define LU_SITE_BITS_MIN    12
-#define LU_SITE_BITS_MAX    24
-/**
- * total 256 buckets, we don't want too many buckets because:
- * - consume too much memory
- * - avoid unbalanced LRU list
- */
-#define LU_SITE_BKT_BITS    8
-
+  * Initialize site \a s, with \a d as the top level device.
+  */
 int lu_site_init(struct lu_site *s, struct lu_device *top)
 {
 int lu_site_init(struct lu_site *s, struct lu_device *top)
 {
-        struct lu_site_bkt_data *bkt;
-        cfs_hash_bd_t bd;
-        char name[16];
-        int bits;
-        int i;
-        ENTRY;
-
-        memset(s, 0, sizeof *s);
-        bits = lu_htable_order();
-        snprintf(name, 16, "lu_site_%s", top->ld_type->ldt_name);
-        for (bits = min(max(LU_SITE_BITS_MIN, bits), LU_SITE_BITS_MAX);
-             bits >= LU_SITE_BITS_MIN; bits--) {
-                s->ls_obj_hash = cfs_hash_create(name, bits, bits,
-                                                 bits - LU_SITE_BKT_BITS,
-                                                 sizeof(*bkt), 0, 0,
-                                                 &lu_site_hash_ops,
-                                                 CFS_HASH_SPIN_BKTLOCK |
-                                                 CFS_HASH_NO_ITEMREF |
-                                                 CFS_HASH_DEPTH |
-                                                 CFS_HASH_ASSERT_EMPTY);
-                if (s->ls_obj_hash != NULL)
-                        break;
-        }
+       struct lu_site_bkt_data *bkt;
+       cfs_hash_bd_t bd;
+       char name[16];
+       int bits;
+       int i;
+       ENTRY;
 
 
-        if (s->ls_obj_hash == NULL) {
-                CERROR("failed to create lu_site hash with bits: %d\n", bits);
-                return -ENOMEM;
-        }
+       memset(s, 0, sizeof *s);
+       bits = lu_htable_order(top);
+       snprintf(name, 16, "lu_site_%s", top->ld_type->ldt_name);
+       for (bits = min(max(LU_SITE_BITS_MIN, bits), LU_SITE_BITS_MAX);
+            bits >= LU_SITE_BITS_MIN; bits--) {
+               s->ls_obj_hash = cfs_hash_create(name, bits, bits,
+                                                bits - LU_SITE_BKT_BITS,
+                                                sizeof(*bkt), 0, 0,
+                                                &lu_site_hash_ops,
+                                                CFS_HASH_SPIN_BKTLOCK |
+                                                CFS_HASH_NO_ITEMREF |
+                                                CFS_HASH_DEPTH |
+                                                CFS_HASH_ASSERT_EMPTY |
+                                                CFS_HASH_COUNTER);
+               if (s->ls_obj_hash != NULL)
+                       break;
+       }
+
+       if (s->ls_obj_hash == NULL) {
+               CERROR("failed to create lu_site hash with bits: %d\n", bits);
+               return -ENOMEM;
+       }
 
        cfs_hash_for_each_bucket(s->ls_obj_hash, &bd, i) {
                bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, &bd);
 
        cfs_hash_for_each_bucket(s->ls_obj_hash, &bd, i) {
                bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, &bd);