From 0123baecc4e2050447f8c4f48f5b33a6d3c524a8 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Thu, 24 Apr 2014 13:47:47 -0700 Subject: [PATCH] LU-5164 osd: Limit lu_object cache For OSDs like ZFS to perform optimally it's important that they be allowed to manage their own cache. This maximizes the likelihood that the ARC will prefetch and cache the right buffers. In the existing ZFS OSD code a cached LU object pins buffers in the ARC preventing them from being dropped. As the LU cache grows it can consume the entire ARC preventing buffers for other objects, such as the OIs, from being cached and severely impacting the performance for FID lookups. By default this patch will only limit the LU cache for ZFS OSDs. NOTES: * Setting LU_CACHE_NR_ZFS_LIMIT to 0 results in an LBUG on the MDS. This may be because an object is being used without a reference. Setting a minimum value of 256 was arbitrary, ideally we would set this value to 0. * In order to be able to quickly determine the number of objects in the hash table the CFS_HASH_COUNTER flag is added. This adds an atomic_inc/dec to the hash insert/remove paths but is not expected to have any measurable impact on performance. 
Signed-off-by: Brian Behlendorf Change-Id: Ia64838d50395f1d22e558631adbfa39d44e91606 Patch-Set: 3 Reviewed-on: http://review.whamcloud.com/10237 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Alex Zhuravlev Reviewed-by: Isaac Huang Reviewed-by: Oleg Drokin --- lustre/obdclass/lu_object.c | 150 ++++++++++++++++++++++++++++++-------------- 1 file changed, 102 insertions(+), 48 deletions(-) diff --git a/lustre/obdclass/lu_object.c b/lustre/obdclass/lu_object.c index 3fac810..fe43331 100644 --- a/lustre/obdclass/lu_object.c +++ b/lustre/obdclass/lu_object.c @@ -62,6 +62,35 @@ extern spinlock_t obd_types_lock; +enum { + LU_CACHE_PERCENT_MAX = 50, + LU_CACHE_PERCENT_DEFAULT = 20 +}; + +#define LU_CACHE_NR_MAX_ADJUST 128 +#define LU_CACHE_NR_UNLIMITED -1 +#define LU_CACHE_NR_DEFAULT LU_CACHE_NR_UNLIMITED +#define LU_CACHE_NR_LDISKFS_LIMIT LU_CACHE_NR_UNLIMITED +#define LU_CACHE_NR_ZFS_LIMIT 256 + +#define LU_SITE_BITS_MIN 12 +#define LU_SITE_BITS_MAX 24 +/** + * total 256 buckets, we don't want too many buckets because: + * - consume too much memory + * - avoid unbalanced LRU list + */ +#define LU_SITE_BKT_BITS 8 + + +static unsigned int lu_cache_percent = LU_CACHE_PERCENT_DEFAULT; +CFS_MODULE_PARM(lu_cache_percent, "i", int, 0644, + "Percentage of memory to be used as lu_object cache"); + +static long lu_cache_nr = LU_CACHE_NR_DEFAULT; +CFS_MODULE_PARM(lu_cache_nr, "l", long, 0644, + "Maximum number of objects in lu_object cache"); + static void lu_object_free(const struct lu_env *env, struct lu_object *o); /** @@ -621,6 +650,30 @@ struct lu_object *lu_object_find(const struct lu_env *env, } EXPORT_SYMBOL(lu_object_find); +/* + * Limit the lu_object cache to a maximum of lu_cache_nr objects. Because + * the calculation for the number of objects to reclaim is not covered by + * a lock the maximum number of objects is capped by LU_CACHE_NR_MAX_ADJUST. + * This ensures that many concurrent threads will not accidentally purge + * the entire cache. 
+ */ +static void lu_object_limit(const struct lu_env *env, + struct lu_device *dev) +{ + __u64 size, nr; + + if (lu_cache_nr == LU_CACHE_NR_UNLIMITED) + return; + + size = cfs_hash_size_get(dev->ld_site->ls_obj_hash); + nr = (__u64)lu_cache_nr; + if (size > nr) + lu_site_purge(env, dev->ld_site, + MIN(size - nr, LU_CACHE_NR_MAX_ADJUST)); + + return; +} + static struct lu_object *lu_object_new(const struct lu_env *env, struct lu_device *dev, const struct lu_fid *f, @@ -641,6 +694,9 @@ static struct lu_object *lu_object_new(const struct lu_env *env, cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash); bkt->lsb_busy++; cfs_hash_bd_unlock(hs, &bd, 1); + + lu_object_limit(env, dev); + return o; } @@ -711,6 +767,9 @@ static struct lu_object *lu_object_find_try(const struct lu_env *env, cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash); bkt->lsb_busy++; cfs_hash_bd_unlock(hs, &bd, 1); + + lu_object_limit(env, dev); + return o; } @@ -883,23 +942,26 @@ void lu_site_print(const struct lu_env *env, struct lu_site *s, void *cookie, } EXPORT_SYMBOL(lu_site_print); -enum { - LU_CACHE_PERCENT_MAX = 50, - LU_CACHE_PERCENT_DEFAULT = 20 -}; - -static unsigned int lu_cache_percent = LU_CACHE_PERCENT_DEFAULT; -CFS_MODULE_PARM(lu_cache_percent, "i", int, 0644, - "Percentage of memory to be used as lu_object cache"); - /** * Return desired hash table order. */ -static int lu_htable_order(void) +static int lu_htable_order(struct lu_device *top) { unsigned long cache_size; int bits; + /* + * For ZFS based OSDs the cache should be disabled by default. This + * allows the ZFS ARC maximum flexibility in determining what buffers + * to cache. If Lustre has objects or buffers which it wants to ensure + * always stay cached it must maintain a hold on them. 
+ */ + if (strcmp(top->ld_type->ldt_name, LUSTRE_OSD_ZFS_NAME) == 0) { + lu_cache_percent = 1; + lu_cache_nr = LU_CACHE_NR_ZFS_LIMIT; + return LU_SITE_BITS_MIN; + } + /* * Calculate hash table size, assuming that we want reasonable * performance when 20% of total memory is occupied by cache of @@ -1020,47 +1082,39 @@ void lu_dev_del_linkage(struct lu_site *s, struct lu_device *d) EXPORT_SYMBOL(lu_dev_del_linkage); /** - * Initialize site \a s, with \a d as the top level device. - */ -#define LU_SITE_BITS_MIN 12 -#define LU_SITE_BITS_MAX 24 -/** - * total 256 buckets, we don't want too many buckets because: - * - consume too much memory - * - avoid unbalanced LRU list - */ -#define LU_SITE_BKT_BITS 8 - + * Initialize site \a s, with \a d as the top level device. + */ int lu_site_init(struct lu_site *s, struct lu_device *top) { - struct lu_site_bkt_data *bkt; - cfs_hash_bd_t bd; - char name[16]; - int bits; - int i; - ENTRY; - - memset(s, 0, sizeof *s); - bits = lu_htable_order(); - snprintf(name, 16, "lu_site_%s", top->ld_type->ldt_name); - for (bits = min(max(LU_SITE_BITS_MIN, bits), LU_SITE_BITS_MAX); - bits >= LU_SITE_BITS_MIN; bits--) { - s->ls_obj_hash = cfs_hash_create(name, bits, bits, - bits - LU_SITE_BKT_BITS, - sizeof(*bkt), 0, 0, - &lu_site_hash_ops, - CFS_HASH_SPIN_BKTLOCK | - CFS_HASH_NO_ITEMREF | - CFS_HASH_DEPTH | - CFS_HASH_ASSERT_EMPTY); - if (s->ls_obj_hash != NULL) - break; - } + struct lu_site_bkt_data *bkt; + cfs_hash_bd_t bd; + char name[16]; + int bits; + int i; + ENTRY; - if (s->ls_obj_hash == NULL) { - CERROR("failed to create lu_site hash with bits: %d\n", bits); - return -ENOMEM; - } + memset(s, 0, sizeof *s); + bits = lu_htable_order(top); + snprintf(name, 16, "lu_site_%s", top->ld_type->ldt_name); + for (bits = min(max(LU_SITE_BITS_MIN, bits), LU_SITE_BITS_MAX); + bits >= LU_SITE_BITS_MIN; bits--) { + s->ls_obj_hash = cfs_hash_create(name, bits, bits, + bits - LU_SITE_BKT_BITS, + sizeof(*bkt), 0, 0, + &lu_site_hash_ops, 
+ CFS_HASH_SPIN_BKTLOCK | + CFS_HASH_NO_ITEMREF | + CFS_HASH_DEPTH | + CFS_HASH_ASSERT_EMPTY | + CFS_HASH_COUNTER); + if (s->ls_obj_hash != NULL) + break; + } + + if (s->ls_obj_hash == NULL) { + CERROR("failed to create lu_site hash with bits: %d\n", bits); + return -ENOMEM; + } cfs_hash_for_each_bucket(s->ls_obj_hash, &bd, i) { bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, &bd); -- 1.8.3.1